# Socks5 proxy scraping module that stores working proxies in mysql database
# © Michael Craze -- http://projectcraze.us.to
#
# Example usage:
# use Proxy_Scraper;
#
# my $db = "proxies";
# my $user = "proxy_db_user";
# my $password = "proxy_db_pass";
#
# my $geoipdb="/home/$USER/code/get_proxies/GeoLiteCity.dat";
#
# # URL to scrape
# my $url = "http://www.some_proxy_site.com/socks5-list/";
#
# # IE7 - some pages print without javascript for IE7
# my $user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)';
#
# # Time in seconds to wait for a response from the site we are accessing via the proxy when checking
# my $time_out = 5;
#
# # Need parens on port_re, not ip_re
# my $ip_re = qr/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/;
# my $port_re = qr/$ip_re:(\d{1,5})/;
#
# my $ps = Proxy_Scraper->new($user,$password,$db,$geoipdb);
# $ps->scrape_url($url,$ip_re,$port_re,$user_agent,$time_out);
# $ps->close();
#
# The scrape_url() method can be called as many times as wanted on different urls before calling close
#!/usr/bin/perl
package Proxy_Scraper;
use strict;
use warnings;
use Exporter;
use LWP::Simple;
use LWP::UserAgent;
use Data::Dumper;
use Geo::IP;
use Socket;
use DBI;
use DateTime;
use Net::Whois::Raw;
use Net::Ping;
# Package metadata.  `our` replaces the obsolete `use vars` pragma; these are
# still the same package globals (e.g. $Proxy_Scraper::VERSION).
require Exporter;
our $VERSION = 1.000_001;
our @ISA     = qw(Exporter);
our @EXPORT  = ();    # list functions/variables that modules exports here

# Verbosity of diagnostic output: 0 is quiet.  The subs in this file test for
# any true value (progress messages), >= 2 (dump of the known-proxy hash) and
# >= 3 (dump of the fetched page).
my $DEBUG_LEVEL = 0;

# When true, scrape_url() performs a whois lookup for each working proxy.
my $GET_WHOIS = 0;
# Dump of Max Mind GeoIPCity.dat Record
#$VAR1 = \bless( {
# 'city' => 'Mountain View',
# 'country_code3' => 'USA',
# 'region_name' => 'California',
# 'country_code' => 'US',
# 'postal_code' => '94040',
# 'continent_code' => 'NA',
# 'metro_code' => 807,
# 'area_code' => 650,
# 'country_name' => 'United States',
# 'longitude' => '-122.0881',
# 'region' => 'CA',
# 'latitude' => '37.3845',
# 'dma_code' => 807
# }, 'Geo::IP::Record' );
# Constructor: opens the GeoIP database and connects to the MySQL database.
#
# Arguments (positional):
#   user     - database user name
#   password - database password
#   db       - database name, appended to the "DBI:mysql:" DSN
#   geoipdb  - path to the GeoIP database file handed to Geo::IP->open
#
# Returns the blessed object.  Dies if the GeoIP database cannot be opened
# or, because RaiseError is set, if the database connection fails.
sub new {
    my ($class, $user, $password, $db, $geoipdb) = @_;

    my $self = {
        user     => $user,
        password => $password,
        db       => $db,
        geoipdb  => $geoipdb,
    };
    $self->{dsn} = "DBI:mysql:$self->{db}";

    # Fail here with a clear message if no handle comes back, instead of much
    # later with "Can't call method ... on an undefined value" during a scrape.
    $self->{gi} = Geo::IP->open($self->{geoipdb}, GEOIP_STANDARD)
        or die "Could not open GeoIP database '"
            . (defined $self->{geoipdb} ? $self->{geoipdb} : '') . "'\n";

    $self->{dbh} = DBI->connect($self->{dsn}, $self->{user}, $self->{password}, {
        PrintError => 0,
        RaiseError => 1,    # database errors are thrown as exceptions
        AutoCommit => 1,
    });

    bless $self, $class;
    return $self;
}
# Disconnect the object's database handle.  Intended to be called once the
# last scrape_url() call has finished.  Returns whatever the handle's
# disconnect method returns.
sub close {
    my $self = shift;
    return $self->{dbh}->disconnect;
}
# Strip leading and trailing whitespace from the first argument IN PLACE:
# the caller's variable is modified through the @_ alias, and the trimmed
# value is also returned.
sub trim {
    for ($_[0]) {
        s/^\s+//;    # leading whitespace
        s/\s+$//;    # trailing whitespace
    }
    return $_[0];
}
# Return 1 if the value consists only of decimal digits, otherwise 0.
#
# The value tested is the first argument.  When called with no arguments at
# all, the topic variable $_ is tested instead, which keeps callers that
# relied on the earlier $_-based behavior working.  An undefined value
# yields 0.  As before, the pattern's `$` anchor tolerates one trailing
# newline.
sub is_numeric {
    my $value = @_ ? $_[0] : $_;
    return 0 unless defined $value;
    return $value =~ m/^\d+$/ ? 1 : 0;
}
# Return the current time as "<ymd> <hms>": the DateTime object's ymd and
# hms strings joined by a single space.
# NOTE(review): DateTime->now is called without a time zone, which presumably
# means UTC -- confirm that is what the `added`/`last_checked` columns expect.
sub get_current_date_time {
    my $now = DateTime->now;
    return sprintf '%s %s', $now->ymd, $now->hms;
}
# Reverse DNS lookup.
#
# Takes an IPv4 address string and returns the host name that gethostbyaddr
# reports for it, or undef when the argument is undefined, cannot be packed
# by inet_aton, or has no name.
#
# Exactly one scalar is returned in every calling context.  gethostbyaddr
# is context-sensitive, so returning its result directly would inject its
# whole list result into any argument list this call is embedded in.
sub get_dns {
    my ($ip) = @_;

    my $hostname;
    if (defined $ip) {
        my $packed = inet_aton($ip);
        $hostname = gethostbyaddr($packed, AF_INET) if defined $packed;
    }
    return $hostname;
}
# Check that the proxy is up and working.
#
# Fetches a test page through the SOCKS proxy at $ip:$port, using the
# user agent string and timeout stored on the object (scrape_url() sets
# both).
#
#   $ip, $port - address of the proxy to test
#   $test_url  - optional page to fetch; defaults to "http://google.com"
#
# Returns 1 when the response status code is 200, otherwise 0.
sub check_proxy {
    my ($self, $ip, $port, $test_url) = @_;
    $test_url = "http://google.com" unless defined $test_url;

    my $ua = LWP::UserAgent->new(agent => $self->{user_agent});
    $ua->timeout($self->{time_out});

    # NOTE(review): the socks:// scheme presumably needs LWP::Protocol::socks
    # to be installed; this file does not load it explicitly -- confirm.
    $ua->proxy([qw(http https)] => "socks://$ip:$port");

    my $res = $ua->get($test_url);
    if ($DEBUG_LEVEL) {
        print "\nWhile checking proxy got: " . $res->code . " " . $res->message . "\n";
    }
    return $res->code eq "200" ? 1 : 0;
}
# Ping a host with Net::Ping (default constructor settings) and report the
# round-trip time.
#
#   $hostname - host name or address to ping
#
# Sends two pings.  Returns 0 when every ping failed, otherwise the average
# duration of the successful pings as reported by Net::Ping.
#
# The result is always a duration.  Returning the packet-loss percentage on
# partial success would make "50% loss" indistinguishable from a round trip
# of 50 to anyone storing or comparing the value.
sub ping {
    my $hostname = shift;

    my $p       = Net::Ping->new();
    my $n       = 2;
    my $time    = 0;
    my $success = 0;

    if ($DEBUG_LEVEL) {
        print "Pinging $hostname $n times.\n";
    }

    foreach my $attempt (1 .. $n) {
        # In list context ping() returns (success flag, duration, address).
        my ($ret, $duration) = $p->ping($hostname);
        if ($ret) {
            $success++;
            $time += $duration;
        }
    }
    $p->close();    # release the pinger's socket

    if (not $success) {
        if ($DEBUG_LEVEL) {
            print "All $n pings failed.\n";
        }
        return 0;
    }

    if ($success < $n) {
        my $lost = $n - $success;
        print $lost . " lost packets. Packet loss ratio: " . int(100 * ($lost / $n)) . "\n";
    }

    my $average = $time / $success;
    if ($DEBUG_LEVEL) {
        print "Average round trip: " . $average . "\n";
    }
    return $average;
}
# Trace the route to a host and describe the last router before it.
#
#   $host - host name or address to trace
#
# Returns one of three newline-terminated strings:
#   "Router was <host>\n"  - route found with more than one hop; <host> is
#                            the first query result of the second-to-last hop
#   "1 or less hops\n"     - route found but with at most one hop
#   "No route found.\n"    - the trace did not reach the host
#
# Dies if Net::Traceroute is not installed.
sub traceroute {
    my $host = shift;

    # Loaded on demand: nothing else in this file loads Net::Traceroute, so
    # without this the constructor call below cannot be resolved.
    require Net::Traceroute;

    my $tr = Net::Traceroute->new(host => $host);
    return "No route found.\n" unless $tr->found;

    my $hops = $tr->hops;
    return "1 or less hops\n" unless $hops > 1;

    return "Router was " . $tr->hop_query_host($hops - 1, 0) . "\n";
}
# Look up the whois record for the given name or address and return what
# Net::Whois::Raw's whois() returns for it.
#
# Sets the package-wide $Net::Whois::Raw::CHECK_FAIL flag to 1 first; the
# flag stays set afterwards.
# NOTE(review): CHECK_FAIL presumably makes whois() report failed lookups
# as undef -- confirm against the Net::Whois::Raw documentation.
sub get_whois_str {
    my ($query) = @_;
    $Net::Whois::Raw::CHECK_FAIL = 1;
    return whois($query);
}
# Builds a hash table of proxies we have already found so we don't add them twice
#
#   $seen - hash reference that is filled IN PLACE: every row of the proxies
#           table is recorded as $seen->{ip} = port.  (Copying the hash
#           first would leave the caller's hash empty.)
#
# Returns the same hash reference.  The table is keyed by IP only, so an IP
# stored with several ports keeps just the last port read.
sub get_proxies_from_db {
    my ($self, $seen) = @_;
    $seen = {} unless defined $seen;

    my $sth = $self->{dbh}->prepare('SELECT ip, port FROM proxies');
    my $rv  = $sth->execute();

    # DBI reports failure with undef; negative values are not errors (-1
    # means "row count unknown").  Only reachable when RaiseError is off.
    unless (defined $rv) {
        print STDERR defined $DBI::errstr ? $DBI::errstr : 'DBI execute failed';
        return $seen;
    }

    while (my @row = $sth->fetchrow_array) {
        my ($ip, $port) = @row;
        $seen->{$ip} = $port;
    }

    if ($DEBUG_LEVEL >= 2) {
        print Dumper $seen;
    }
    return $seen;
}
# Checks if scraped proxy is already in our database
#
#   $ip, $port - the scraped proxy
#   $seen      - hash reference mapping IP => port of the known proxies
#
# Returns 1 when $seen holds $ip with exactly this port, otherwise 0.
# Uses a direct hash lookup instead of copying and scanning the whole table,
# and treats undefined arguments as "not known" without emitting warnings.
sub proxy_in_db {
    my ($ip, $port, $seen) = @_;

    return 0 unless defined $ip && defined $port && defined $seen;

    # exists() first so the lookup never adds the IP to the table.
    return 0 unless exists $seen->{$ip} && defined $seen->{$ip};

    return $port eq $seen->{$ip} ? 1 : 0;
}
# Insert one proxy row, including its whois text, into the proxies table.
#
# Arguments after $self are the 15 column values, in this order:
#   ip, port, dns, country_code, country_name, region_name, city,
#   postal_code, area_code, latitude, longitude, whois, ping, added,
#   last_checked
# Missing trailing values are bound as undef.
#
# Returns the result of execute(): a true value on success, undef on failure.
sub store_proxy_and_whois {
    my ($self, @values) = @_;

    my $sth = $self->{dbh}->prepare("INSERT INTO proxies (ip, port, dns, country_code, country_name, region_name, city, postal_code, area_code, latitude, longitude, whois, ping, added, last_checked) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
    my $rv = $sth->execute(@values[0 .. 14]);

    # DBI reports failure with undef; a negative value is not an error (-1
    # means "row count unknown").  Only reachable when RaiseError is off.
    unless (defined $rv) {
        print STDERR defined $DBI::errstr ? $DBI::errstr : 'DBI execute failed';
    }
    return $rv;
}
# Insert one proxy row WITHOUT whois text into the proxies table.
#
# Arguments after $self are the 14 column values, in this order:
#   ip, port, dns, country_code, country_name, region_name, city,
#   postal_code, area_code, latitude, longitude, ping, added, last_checked
# Missing trailing values are bound as undef.
#
# The column list deliberately omits `whois`: listing it with only 14
# placeholders and 14 bound values makes the statement invalid, because the
# column count and value count no longer match.
#
# Returns the result of execute(): a true value on success, undef on failure.
sub store_proxy {
    my ($self, @values) = @_;

    my $sth = $self->{dbh}->prepare("INSERT INTO proxies (ip, port, dns, country_code, country_name, region_name, city, postal_code, area_code, latitude, longitude, ping, added, last_checked) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
    my $rv = $sth->execute(@values[0 .. 13]);

    # DBI reports failure with undef; a negative value is not an error (-1
    # means "row count unknown").  Only reachable when RaiseError is off.
    unless (defined $rv) {
        print STDERR defined $DBI::errstr ? $DBI::errstr : 'DBI execute failed';
    }
    return $rv;
}
# Print the arguments as one comma-separated line, terminated by a newline,
# to the currently selected output handle.  Fields are joined as-is: no
# quoting or escaping is applied.  Returns the result of print.
sub print_proxy_csv {
    my @fields = @_;
    my $line   = join ',', @fields;
    return print "$line\n";
}
# Pair every port found in $content with the IP address it belongs to.
#
# Preferred strategy: each match of $port_re (first capture group = port) is
# searched for $ip_re, so an IP and its port always come from the same piece
# of text.  If some $port_re match contains no IP or captures no port, fall
# back to pairing the Nth IP on the page with the Nth port, truncated to the
# shorter of the two lists.
#
# Returns a list of [ip, port] array references.
sub _extract_proxy_pairs {
    my ($content, $ip_re, $port_re) = @_;

    my @pairs;
    my $needs_fallback = 0;
    while ($content =~ m/$port_re/gi) {
        my $port    = $1;
        my $matched = substr $content, $-[0], $+[0] - $-[0];
        if (defined $port && $matched =~ m/($ip_re)/i) {
            push @pairs, [$1, $port];
        }
        else {
            $needs_fallback = 1;
            last;
        }
    }
    return @pairs unless $needs_fallback;

    # Leaving the loop early keeps the /g search position; restart from 0.
    pos($content) = 0;
    my @ips   = $content =~ m/($ip_re)/gi;
    pos($content) = 0;
    my @ports = $content =~ m/$port_re/gi;
    my $count = @ips < @ports ? scalar @ips : scalar @ports;
    return map { [ $ips[$_], $ports[$_] ] } 0 .. $count - 1;
}

# Scrape one proxy-list page and store every working proxy that is not
# already in the database.
#
# Arguments (positional; all are stored on the object, and check_proxy()
# reads user_agent and time_out from there):
#   url        - page to fetch
#   ip_re      - regex matching an IP address (without capture groups)
#   port_re    - regex whose first capture group is the port
#   user_agent - User-Agent string for the page fetch and the proxy checks
#   time_out   - timeout in seconds for the page fetch and the proxy checks
#
# For each proxy found: already-known ones and ones that fail check_proxy()
# get a status line; working ones are printed as a CSV line and inserted
# into the database.
#
# Returns 0 when the page was fetched with status 200.  Otherwise prints the
# failure to STDERR and returns 1.
sub scrape_url {
    my $self = shift;
    $self->{url}        = shift;
    $self->{ip_re}      = shift;
    $self->{port_re}    = shift;
    $self->{user_agent} = shift;
    $self->{time_out}   = shift;

    # Must start empty: initializing a hash from {} stores a stray
    # "HASH(0x...)" key instead.
    my %seen;
    $self->get_proxies_from_db(\%seen);

    my $ua = LWP::UserAgent->new(agent => $self->{user_agent});
    $ua->timeout($self->{time_out});
    my $res = $ua->get($self->{url});

    if ($res->code ne "200") {
        print STDERR "Couldn't get url ($self->{url}): " . $res->code . " " . $res->message . "\n";
        return 1;
    }

    my $content = $res->decoded_content;
    $content = '' unless defined $content;
    if ($DEBUG_LEVEL >= 3) {
        print $content;
    }

    my @geo_fields = qw(country_code country_name region_name city
                        postal_code area_code latitude longitude);

    my $i = 0;
    foreach my $pair (_extract_proxy_pairs($content, $self->{ip_re}, $self->{port_re})) {
        my ($ip, $port) = @{$pair};

        # Skip a known proxy instead of abandoning the rest of the page, so
        # new entries listed after it are still processed.
        if (proxy_in_db($ip, $port, \%seen)) {
            print "$ip:$port Already Seen.\n";
            next;
        }

        if (!$self->check_proxy($ip, $port)) {
            print "$ip:$port is down.\n";
            next;
        }

        my $ping = ping($ip);
        my $dns  = get_dns($ip);

        my $whois_data = "";
        if ($GET_WHOIS) {
            # Fall back to the IP when there is no reverse DNS name.
            $whois_data = get_whois_str(defined $dns ? $dns : $ip);
        }

        # Max Mind GeoIP record; handled as possibly undef, in which case
        # every geo column is stored as undef instead of dying.
        my $r   = $self->{gi}->record_by_addr($ip);
        my @geo = map { defined $r ? scalar $r->$_() : undef } @geo_fields;

        my @record = ($ip, $port, $dns, @geo);
        print_proxy_csv(map { defined $_ ? $_ : '' } @record);

        my $now = get_current_date_time();
        if (defined $whois_data) {
            $self->store_proxy_and_whois(@record, $whois_data, $ping, $now, $now);
        }
        else {
            $self->store_proxy(@record, $ping, $now, $now);
        }

        # Remember it so a duplicate later on the same page is not re-added.
        $seen{$ip} = $port;
    }
    continue {
        # Blank line after every 100 processed entries.
        if ($i > 0 && $i % 100 == 0) {
            print "\n";
        }
        $i++;
    }
    return 0;
}
1;
Tuesday, November 15, 2016
Socks5 Proxy Scraping Perl Module for proxychains and DNS Leak Prevention
This is a Perl module I wrote for scraping SOCKS5 proxies off websites. It verifies that each proxy is working before adding it to the MySQL database. I will be adding support for performing OCR, using the Tesseract Perl library, on images matched by the IP or port regex. I have also written a PHP front end, along with scripts that run as cron jobs to update the MaxMind GeoIP database and re-check the proxies in the MySQL database.
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment