Source

"""This is a recursive web crawler. Don't go pointing this at random sites;it doesn't respect robots.txt and it is pretty brutal about how quickly it fetches pages.The code for this is very short; this is perhaps a good indicationthat this is making the most effective use of the primitves at hand.The fetch function does all the work of making http requests,searching for new urls, and dispatching new fetches. The GreenPoolacts as sort of a job coordinator (and concurrency controller ofcourse)."""from__future__importwith_statementfromeventlet.greenimporturllib2importeventletimportre# http://daringfireball.net/2009/11/liberal_regex_for_matching_urlsurl_regex=re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')deffetch(url,seen,pool):"""Fetch a url, stick any found urls into the seen set, and dispatch any new ones to the pool."""print"fetching",urldata=''witheventlet.Timeout(5,False):data=urllib2.urlopen(url).read()forurl_matchinurl_regex.finditer(data):new_url=url_match.group(0)# only send requests to eventlet.net so as not to destroy the internetifnew_urlnotinseenand'eventlet.net'innew_url:seen.add(new_url)# while this seems stack-recursive, it's actually not:# spawned greenthreads start their own stackspool.spawn_n(fetch,new_url,seen,pool)defcrawl(start_url):"""Recursively crawl starting from *start_url*. Returns a set of urls that were found."""pool=eventlet.GreenPool()seen=set()fetch(start_url,seen,pool)pool.waitall()returnseenseen=crawl("http://eventlet.net")print"I saw these urls:"print"\n".join(seen)