#!/usr/bin/perl
# se.pl by Davide +mala Eynard, 2010-2011
# run me as "perl se.pl <testsearch.json" - if you have downloaded me from
# the right place you also know what that json file is ;-) , otherwise check
# the related post on http://davide.eynard.it
#
# What the script does:
#   1. Reads a JSON configuration from standard input (or from the files
#      named on the command line). The configuration holds two objects:
#        'engines' : engine name => profile, where a profile may contain
#                    'id', 'url', 'fields', 'regexp', 'next', 'nextURL'
#                    and 'sleep';
#        'search'  : term id => search term.
#   2. For every search term and every engine: loads the engine's 'url',
#      submits the form found there with 'fields' (every literal "$search"
#      in a field value is replaced by the search term), extracts result
#      URLs with 'regexp' (capture group 1) and follows the "next" link for
#      up to $PAGENUM pages.
#   3. Prints a JSON document with 'status' and 'contents' (the search
#      terms, per-domain statistics, and the URLs ranked by how many times
#      they were returned).
#
# NOTE: debug messages are written to standard output as well, so the output
# is only pure JSON when $DEBUG is 0.

use strict;
use warnings;
use utf8;

use WWW::Mechanize;
use URI::Escape;
use JSON;
use Data::Dumper;

$| = 1;

my $SLEEP     = 2;    # default pause (seconds) between two page downloads
my $AUTOCHECK = 0;    # let us inspect failed responses instead of dying
my $USE_PROXY = 0;    # if yahoo bans you, set this to 1 ;-)
my $PROXY     = "http://localhost:8118";    # I use tor to anonymize my connections...
my $DEBUG     = 1;
my $PAGENUM   = 3;    # maximum number of result pages parsed per engine

# ----------------------------------------------------------------------------

my %urllist;         # gathered URLs => number of times each one was returned
my %results_hash;    # domain => { results, engines, urls }

# ----------------------------------------------------------------------------

# read the json configuration (standard input, or files given on the cmdline)
my $json = '';
while ( my $line = <> ) {
    $json .= $line;
}

if ( $json eq "" ) {
    dieWithErr("Where is the json?");
}

# Parse once; a malformed document is reported as a JSON error like every
# other failure instead of aborting with a raw Perl exception.
my $jsonobj    = eval { from_json($json) };
my $json_error = $@;
if ( ref($jsonobj) ne 'HASH' ) {
    dieWithErr( "Invalid json: "
          . ( $json_error || "the top-level value is not an object\n" ) );
}

# engine profiles and search terms
my $engines  = $jsonobj->{'engines'};
my $searches = $jsonobj->{'search'};
dieWithErr("No engines defined in the json")
  unless ref($engines) eq 'HASH';
dieWithErr("No search terms defined in the json")
  unless ref($searches) eq 'HASH';

my $mech = WWW::Mechanize->new( autocheck => $AUTOCHECK );
if ($USE_PROXY) {
    $mech->proxy( [ 'http', 'ftp' ], $PROXY );
}

for my $searchkey ( sort keys %$searches ) {
    my $search = $searches->{$searchkey};

    # get results from browser
    for my $engine ( sort keys %$engines ) {
        my $profile  = $engines->{$engine};
        my $engineId = $profile->{'id'};

        # Compile the result pattern once per engine. An empty pattern would
        # not mean "match nothing", so it is rejected up front.
        my $pattern = $profile->{'regexp'};
        dieWithErr("No regexp defined for $engine\n")
          unless defined $pattern && length $pattern;
        my $result_re = eval { qr/$pattern/si };
        dieWithErr("Bad regexp for $engine: $@") unless $result_re;

        print "[i] Searching $search on $engine... \n" if $DEBUG;

        my $res = $mech->get( $profile->{'url'} );
        if ( !$res || !$res->is_success() ) {
            dieWithErr("Probs contacting $engine\n");
        }

        # Work on a copy so the "$search" placeholders in the profile stay
        # intact for the next search term.
        my %fields = %{ $profile->{'fields'} || {} };
        for my $fk ( keys %fields ) {
            $fields{$fk} =~ s/\$search/$search/g;
        }
        print Dumper( \%fields ) . "\n" if $DEBUG;

        # submit_form croaks when no matching form exists, hence the eval
        $res = eval { $mech->submit_form( fields => \%fields ) };
        if ( !$res || !$res->is_success() ) {
            dieWithErr("Probs contacting $engine\n");
        }
        print "done.\n" if $DEBUG;

        my $page = 0;

        # $results_partial contains the number of results for each engine
        my $results_partial = 0;

        while ( $res && $res->is_success() ) {
            print "I have downloaded " . $res->base() . "\n" if $DEBUG;

            # get sleep time from config, if it's not present take the default
            sleep( $profile->{'sleep'} || $SLEEP );

            # parse results from engine
            $page++;
            print " [i] Parsing page $page...\n" if $DEBUG;

            my $content  = $mech->content();
            my $position = -1;
            while ( $content =~ /$result_re/g ) {
                my $referer = $1;
                $position++;
                $results_partial++;

                # a pattern without capture group 1 yields nothing usable
                next unless defined $referer;
                $urllist{$referer}++;

                # the rank formula assumes 10 results per page
                my $rank = ( $page - 1 ) * 10 + $position;

                # The domain is recomputed for every result. Without a
                # recognisable scheme://host the URL is still counted in
                # %urllist but is not attributed to any domain.
                my ($mydomain) = $referer =~ m{(https?://[^/?#]+)}i;
                if ( !defined $mydomain ) {
                    print "$engineId : $search : (no domain) : $referer\n"
                      if $DEBUG;
                    next;
                }
                print "$engineId : $search : $mydomain : $referer\n"
                  if $DEBUG;

                # domain has to be the key, so:
                my $entry = $results_hash{$mydomain} ||= {};

                # 1) Increment domain->results
                $entry->{'results'}++;

                # 2) Add the engine to the domain engines list
                $entry->{'engines'}{$engineId}++;

                # 3) Add URL->{engine, rank}
                push @{ $entry->{'urls'}{$referer} },
                  {
                    "engineId" => $engineId,
                    "rank"     => $rank,
                    "termId"   => $searchkey,
                  };
            }

            last if $page >= $PAGENUM;

            # get next page from engine
            my %link_filter;
            my $next_text = $profile->{'next'};
            my $next_url  = $profile->{'nextURL'};
            if ( defined $next_text ) {
                $link_filter{text_regex} = qr/$next_text/;
            }
            if ( defined $next_url ) {
                print "Using filter on URL too: \n" . $next_url . "\n"
                  if $DEBUG;
                $link_filter{url_abs_regex} = qr/$next_url/;
            }

            # with no filter at all any link would match: stop paging
            last unless %link_filter;

            # follow_link returns nothing when no link matches, which ends
            # the loop
            $res = $mech->follow_link(%link_filter);
        }

        # Remember how many results this engine returned for the last term
        # processed (the value is not part of the printed output).
        $profile->{'results'} = $results_partial;
    }
}

# most frequently returned URLs first; ties are broken alphabetically so the
# order is stable between runs
my @rankedURLs =
  sort { $urllist{$b} <=> $urllist{$a} or $a cmp $b } keys %urllist;

my %output = (
    'status'   => "OK",
    'contents' => {
        'terms'   => $searches,
        'domains' => \%results_hash,
        'urls'    => \@rankedURLs,
    },
);

my $jsonResult = JSON->new->pretty->encode( \%output );

# comment this if you don't want to enable utf8 encoding
utf8::encode($jsonResult);
print $jsonResult;
exit;

#----------------------------------------------------------------------------

# dieWithErr($error)
# Prints a JSON document { "status": "ERROR", "contents": $error } on standard
# output and terminates the script. The process still exits through a plain
# exit (status 0); the failure is signalled by the "status" field only.
sub dieWithErr {
    my $error = shift;

    my %results = (
        status   => "ERROR",
        contents => $error,
    );

    my $payload = JSON->new->pretty->encode( \%results );

    # same encoding step as the success path
    utf8::encode($payload);
    print $payload;
    exit;
}