Tuesday, November 15, 2016

Socks5 Proxy Scraping Perl Module for proxychains and DNS Leak Prevention

This is a Perl module I wrote for scraping SOCKS5 proxies off websites. It verifies that each proxy is working before adding it to a MySQL database. I plan to add support for running OCR, via the Tesseract Perl library, on images matched by the IP or port regex. I have also written a PHP front end, along with scripts that run as cron jobs to update the MaxMind GeoIP database and to re-check the proxies stored in the MySQL database.


# Socks5 proxy scraping module that stores working proxies in mysql database
# © Michael Craze -- http://projectcraze.us.to
#
# Example usage:
# use Proxy_Scraper;
#
# my $db = "proxies";
# my $user = "proxy_db_user";
# my $password = "proxy_db_pass";
#
# my $geoipdb="/home/$ENV{USER}/code/get_proxies/GeoLiteCity.dat";
#
# # URL to scrape
# my $url = "http://www.some_proxy_site.com/socks5-list/";
#
# # IE7 - some pages print without javascript for IE7
# my $user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)';
#
# # Time in seconds to wait for a response from the site we are accessing via the proxy when checking
# my $time_out = 5;
#
# # Need parens on port_re, not ip_re
# my $ip_re = qr/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/;
# my $port_re = qr/$ip_re:(\d{1,5})/;
#
# my $ps = Proxy_Scraper->new($user,$password,$db,$geoipdb);
# $ps->scrape_url($url,$ip_re,$port_re,$user_agent,$time_out);
# $ps->close();
#
# The scrape_url() method can be called as many times as wanted on different urls before calling close

#!/usr/bin/perl

package Proxy_Scraper;

use strict;
use warnings;
use Exporter;
use LWP::Simple;
use LWP::UserAgent;
use Data::Dumper;
use Geo::IP;
use Socket;
use DBI;
use DateTime;
use Net::Whois::Raw;
use Net::Ping;

# Exporter is loaded at run time; the package inherits from it but exports nothing.
require Exporter;

our $VERSION = 1.000_001;
our @ISA     = ('Exporter');
our @EXPORT  = ();    # list functions/variables that the module exports here

# File-scoped switches read by the subs in this file.
my $DEBUG_LEVEL = 0;    # 0 is quiet; the subs test for true, >= 2 and >= 3
my $GET_WHOIS   = 0;    # when true, scrape_url also fetches whois data

# Dump of Max Mind GeoIPCity.dat Record
#$VAR1 = \bless( {
#                   'city' => 'Mountain View',
#                   'country_code3' => 'USA',
#                   'region_name' => 'California',
#                   'country_code' => 'US',
#                   'postal_code' => '94040',
#                   'continent_code' => 'NA',
#                   'metro_code' => 807,
#                   'area_code' => 650,
#                   'country_name' => 'United States',
#                   'longitude' => '-122.0881',
#                   'region' => 'CA',
#                   'latitude' => '37.3845',
#                   'dma_code' => 807
#                 }, 'Geo::IP::Record' );

# Constructor: opens the GeoIP database and connects to the MySQL database.
#
# Positional arguments:
#   $user     - database user name
#   $password - database password
#   $db       - database name; the DSN is built as "DBI:mysql:$db"
#   $geoipdb  - path to the GeoIP city database file
#
# Returns the blessed object. The Geo::IP handle is kept in {gi} and the DBI
# handle in {dbh}. Dies if the GeoIP database cannot be opened, or (because
# RaiseError is set) if the database connection fails.
sub new{
 my ($class, $user, $password, $db, $geoipdb) = @_;

 my $self = {
  user => $user,
  password => $password,
  db => $db,
  geoipdb => $geoipdb,
 };

 $self->{dsn} = "DBI:mysql:$self->{db}";

 # Fail here, with the offending path in the message, rather than later with
 # "Can't call method ... on an undefined value" inside scrape_url.
 $self->{gi} = Geo::IP->open($self->{geoipdb}, GEOIP_STANDARD)
  or die "Could not open GeoIP database '"
   . (defined $self->{geoipdb} ? $self->{geoipdb} : '')
   . "'\n";

 $self->{dbh} = DBI->connect($self->{dsn}, $self->{user}, $self->{password}, {
  PrintError => 0,
  RaiseError => 1,
  AutoCommit => 1,
 });

 return bless $self, $class;
}

# Disconnects the DBI handle held in {dbh} and hands back whatever
# disconnect returned.
sub close {
 my $self = shift;
 my $dbh = $self->{dbh};
 return $dbh->disconnect;
}

# Strips leading and trailing whitespace from the first argument IN PLACE
# (the caller's variable is modified through the @_ alias) and returns it.
sub trim {
 for ($_[0]) {
  s/^\s+//;
  s/\s+$//;
 }
 return $_[0];
}

# Returns 1 if the value consists solely of one or more digits, 0 otherwise.
#
# The value tested is the first argument. When called with no arguments at
# all it falls back to $_, which is what the previous implementation always
# (and mistakenly) tested. undef is never numeric.
sub is_numeric {
 my $value = @_ ? $_[0] : $_;
 return 0 unless defined $value;
 # \A and \z anchor to the true string ends; a bare $ would also accept a
 # trailing newline such as "80\n".
 return $value =~ m/\A\d+\z/ ? 1 : 0;
}

# Returns the current time from DateTime->now as one string: the ymd part,
# a single space, then the hms part.
# NOTE(review): DateTime->now presumably defaults to UTC -- confirm before
# relying on the time zone of the stored timestamps.
sub get_current_date_time {
 my $now = DateTime->now;
 my $date = $now->ymd;
 my $time = $now->hms;
 return "$date $time";
}

# Reverse DNS lookup.
#
# Takes an address string, converts it with inet_aton and returns the host
# name reported by gethostbyaddr. Returns undef when no address is given,
# when inet_aton cannot convert it, or when the lookup finds no name.
sub get_dns {
 my ($ip) = @_;
 return unless defined $ip && length $ip;

 my $packed = inet_aton($ip);
 return unless defined $packed;

 # Assigning to a scalar forces scalar context, so only the host name is
 # returned even if the caller calls get_dns in list context.
 my $hostname = gethostbyaddr($packed, AF_INET);
 return $hostname;
}

# Check that the proxy is up and working.
#
# Fetches http://google.com through socks://$ip:$port using the user agent
# string and timeout stored on the object ({user_agent}, {time_out}).
# Returns 1 when the final response code is 200, otherwise 0.
sub check_proxy {
 my ($self, $ip, $port) = @_;

 my $agent = LWP::UserAgent->new(agent => $self->{user_agent});
 $agent->timeout($self->{time_out});
 $agent->proxy([qw(http https)] => "socks://$ip:$port");

 my $response = $agent->get("http://google.com");
 if ($DEBUG_LEVEL) {
  print "\nWhile checking proxy got: " . $response->code . " " . $response->message . "\n";
 }

 return 1 if $response->code eq "200";
 return 0;
}

# Pings $hostname twice with a default Net::Ping object.
#
# Returns 0 when every attempt failed; otherwise the average duration that
# Net::Ping reported for the successful attempts.
#
# The earlier version returned the packet-loss percentage whenever some, but
# not all, attempts failed. Callers store the result as a ping time and could
# not tell the two apart. Loss is now only reported on STDOUT.
sub ping{
 my $hostname = shift;
 my $n = 2;
 my $time = 0;
 my $success = 0;

 my $p = Net::Ping->new();
 if($DEBUG_LEVEL){
  print "Pinging $hostname $n times.\n";
 }
 for (1 .. $n) {
  my ($ret, $duration) = $p->ping($hostname);
  next unless $ret;
  $success++;
  $time += $duration;
 }
 $p->close();

 if (not $success) {
  if($DEBUG_LEVEL){
   print "All $n pings failed.\n";
  }
  return 0;
 }

 if ($success < $n) {
  # Printed unconditionally: this is the only remaining signal of partial loss.
  my $lost = $n - $success;
  print $lost . " lost packets. Packet loss ratio: " . int(100 * ($lost / $n)) . "\n";
 }

 my $average = $time / $success;
 if($DEBUG_LEVEL){
  print "Average round trip: " . $average . "\n";
 }
 return $average;
}

# Runs a traceroute to $host and returns a one-line, newline-terminated
# description: the host that answered the first query of the second-to-last
# hop, "1 or less hops", or "No route found.".
#
# Net::Traceroute is loaded here on demand because the file never loads it
# elsewhere; without this the ->new call fails at run time.
sub traceroute{
 my $host = shift;

 require Net::Traceroute;
 my $tr = Net::Traceroute->new(host => $host);

 if(!$tr->found) {
  return "No route found.\n";
 }

 my $hops = $tr->hops;
 if($hops > 1) {
  return "Router was " .
   $tr->hop_query_host($hops - 1, 0) . "\n";
 }
 return "1 or less hops\n";
}

# Looks up whois data for the given name and returns whatever
# Net::Whois::Raw's whois() returns for it.
sub get_whois_str {
 my ($target) = @_;
 # Package-wide Net::Whois::Raw setting; it stays set after this call.
 # NOTE(review): presumably makes whois() return undef for "not found"
 # style answers -- confirm against the Net::Whois::Raw documentation.
 $Net::Whois::Raw::CHECK_FAIL = 1;
 return whois($target);
}

# Builds a hash table of proxies we have already found so we don't add them twice.
#
# Arguments:
#   $seen - hash reference to fill (ip => port). It is updated in place, so
#           the caller sees the rows that were read. A fresh hash is used
#           when none is passed.
#
# Returns the hash reference that was filled.
#
# The earlier version copied the caller's hash (%{$_[0]}) and filled the copy,
# so the caller's hash always stayed empty.
sub get_proxies_from_db{
 my ($self, $seen) = @_;
 $seen ||= {};

 my $sql = 'SELECT ip, port FROM proxies';
 my $sth = $self->{dbh}->prepare($sql);
 my $rv = $sth->execute();
 # DBI's execute returns undef on failure; a negative value only means the
 # row count is unknown and is not an error.
 if(!defined $rv){
  print STDERR $DBI::errstr;
  return $seen;
 }

 while(my ($ip, $port) = $sth->fetchrow_array) {
  $seen->{$ip} = $port;
 }
 if($DEBUG_LEVEL >= 2){
  print Dumper $seen;
 }
 return $seen;
}

# Checks if scraped proxy is already in our database.
#
# Arguments:
#   $ip, $port - the scraped proxy
#   $seen      - hash reference of known proxies (ip => port)
#
# Returns 1 when $seen holds exactly this ip/port pair, otherwise 0.
# Uses a direct hash lookup instead of copying the hash and scanning every
# key, and never adds keys to $seen.
sub proxy_in_db{
 my ($ip, $port, $seen) = @_;
 return 0 unless defined $ip && defined $port;
 return 0 unless ref $seen eq 'HASH';
 return 0 unless exists $seen->{$ip} && defined $seen->{$ip};
 return $seen->{$ip} eq $port ? 1 : 0;
}

# Inserts one proxy row, including its whois text, into the proxies table.
#
# Arguments after $self are the 15 column values, in this order:
#   ip, port, dns, country_code, country_name, region_name, city,
#   postal_code, area_code, latitude, longitude, whois, ping, added,
#   last_checked
# Missing trailing values are bound as undef; extra values are ignored.
#
# Returns the value of the statement's execute call.
sub store_proxy_and_whois{
 my ($self, @values) = @_;

 my @columns = qw(ip port dns country_code country_name region_name city
  postal_code area_code latitude longitude whois ping added last_checked);
 $#values = $#columns;    # exactly one bind value per column

 # Building both lists from @columns keeps names and placeholders in step.
 my $sql = sprintf 'INSERT INTO proxies (%s) VALUES (%s)',
  join(', ', @columns), join(', ', ('?') x @columns);

 my $sth = $self->{dbh}->prepare($sql);
 my $rv = $sth->execute(@values);
 # execute returns undef on failure; -1 only means "row count unknown".
 if(!defined $rv){
  print STDERR $DBI::errstr;
 }
 return $rv;
}

# Inserts one proxy row WITHOUT whois text into the proxies table.
#
# Arguments after $self are the 14 column values, in this order:
#   ip, port, dns, country_code, country_name, region_name, city,
#   postal_code, area_code, latitude, longitude, ping, added, last_checked
# Missing trailing values are bound as undef; extra values are ignored.
#
# Returns the value of the statement's execute call.
#
# The earlier statement also named the whois column, giving 15 columns for
# 14 placeholders and 14 bind values.
sub store_proxy{
 my ($self, @values) = @_;

 my @columns = qw(ip port dns country_code country_name region_name city
  postal_code area_code latitude longitude ping added last_checked);
 $#values = $#columns;    # exactly one bind value per column

 # Building both lists from @columns keeps names and placeholders in step.
 my $sql = sprintf 'INSERT INTO proxies (%s) VALUES (%s)',
  join(', ', @columns), join(', ', ('?') x @columns);

 my $sth = $self->{dbh}->prepare($sql);
 my $rv = $sth->execute(@values);
 # execute returns undef on failure; -1 only means "row count unknown".
 if(!defined $rv){
  print STDERR $DBI::errstr;
 }
 return $rv;
}

# Prints all arguments on one line, separated by commas, followed by a
# newline. No quoting or escaping is applied to the values.
sub print_proxy_csv {
 my $line = join ',', @_;
 print "$line\n";
}

# Private helper for scrape_url: collects ping, reverse DNS, optional whois
# and GeoIP data for a proxy that passed check_proxy, prints it as a CSV line
# and inserts it into the database.
sub _save_working_proxy{
 my ($self, $ip, $port) = @_;

 my $ping = ping($ip);
 my $dns = get_dns($ip);

 my $whois_data = "";
 # Only query whois when there is a host name to ask about.
 if($GET_WHOIS && defined $dns){
  $whois_data = get_whois_str($dns);
 }

 # Max Mind GeoIP record; record_by_addr may return nothing for an address,
 # in which case every geo column is stored as undef.
 my $r = $self->{gi}->record_by_addr($ip);
 my @geo;
 for my $field (qw(country_code country_name region_name city postal_code area_code latitude longitude)){
  push @geo, (defined $r ? scalar($r->$field) : undef);
 }

 # Blank out undefined values so the CSV line prints without warnings.
 print_proxy_csv(map { defined $_ ? $_ : '' } ($ip, $port, $dns, @geo));

 my $now = get_current_date_time();
 if(defined $whois_data){
  $self->store_proxy_and_whois($ip, $port, $dns, @geo, $whois_data, $ping, $now, $now);
 }
 else{
  $self->store_proxy($ip, $port, $dns, @geo, $ping, $now, $now);
 }
 return;
}

# Fetches a page, extracts proxy addresses and ports from it, and stores
# every proxy that is not yet known and passes check_proxy.
#
# Arguments (all are also saved on the object under the same names):
#   url        - page to scrape
#   ip_re      - regex matching an IP address; must not contain capture groups
#   port_re    - regex whose single capture group yields the port
#   user_agent - User-Agent string for the page fetch and the proxy checks
#   time_out   - timeout, passed to LWP::UserAgent, for both kinds of request
#
# The n-th ip_re match is paired with the n-th port_re capture.
#
# Returns 0 when the page was fetched (response code 200), 1 otherwise.
sub scrape_url{
 my ($self, $url, $ip_re, $port_re, $user_agent, $time_out) = @_;
 $self->{url} = $url;
 $self->{ip_re} = $ip_re;
 $self->{port_re} = $port_re;
 $self->{user_agent} = $user_agent;
 $self->{time_out} = $time_out;

 # Start from an empty hash. "my %seen = {}" would add one bogus key made
 # from the stringified hash reference.
 my %seen;
 $self->get_proxies_from_db(\%seen);

 my $ua = LWP::UserAgent->new(agent => $self->{user_agent});
 $ua->timeout($self->{time_out});
 my $res = $ua->get($self->{url});
 if($res->code ne "200"){
  print STDERR "Couldn't get url ($self->{url}): " . $res->code . " " . $res->message . "\n";
  return 1;
 }

 # Decode the body once and reuse it.
 my $content = $res->decoded_content;
 $content = '' unless defined $content;
 if($DEBUG_LEVEL >= 3){
  print $content;
 }

 my @ips = $content =~ m/($self->{ip_re})/gi;
 my @ports = $content =~ m/$self->{port_re}/gi;

 for my $i (0 .. $#ips){
  my $ip = $ips[$i];
  my $port = $ports[$i];
  # Fewer ports than addresses: nothing further can be paired up.
  last unless defined $port;

  if(proxy_in_db($ip,$port,\%seen)){
   # Skip only this entry; later entries on the page may still be new.
   print "$ip:$port Already Seen.\n";
  }
  elsif($self->check_proxy($ip,$port)){
   $self->_save_working_proxy($ip,$port);
   # Remember it so a repeat on the same page is not inserted twice.
   $seen{$ip} = $port;
  }
  else{
   print "$ip:$port is down.\n";
  }

  if($i > 0 && $i % 100 == 0){
   print "\n";
  }
 }
 return 0;
}

1;

No comments:

Post a Comment