# Open an arbitrary URL## See the following document for a tentative description of URLs:# Uniform Resource Locators Tim Berners-Lee# INTERNET DRAFT CERN# IETF URL Working Group 14 July 1993# draft-ietf-uri-url-01.txt## The object returned by URLopener().open(file) will differ per# protocol. All you know is that is has methods read(), readline(),# readlines(), fileno(), close() and info(). The read*(), fileno()# and close() methods work like those of open files. # The info() method returns an mimetools.Message object which can be# used to query various info about the object, if available.# (mimetools.Message objects are queried with the getheader() method.)importstringimportsocketimportregeximportos__version__='1.3'# Helper for non-unix systemsifos.name=='mac':frommacurl2pathimporturl2pathname,pathname2urlelifos.name=='nt':fromnturl2pathimporturl2pathname,pathname2urlelse:defurl2pathname(pathname):returnpathnamedefpathname2url(pathname):returnpathname# This really consists of two pieces:# (1) a class which handles opening of all sorts of URLs# (plus assorted utilities etc.)# (2) a set of functions for parsing URLs# XXX Should these be separated out into different modules?# Shortcut for basic usage_urlopener=Nonedefurlopen(url):global_urlopenerifnot_urlopener:_urlopener=FancyURLopener()return_urlopener.open(url)defurlretrieve(url,filename=None):global_urlopenerifnot_urlopener:_urlopener=FancyURLopener()iffilename:return_urlopener.retrieve(url,filename)else:return_urlopener.retrieve(url)defurlcleanup():if_urlopener:_urlopener.cleanup()# Class to open URLs.# This is a class rather than just a subroutine because we may need# more than one set of global protocol-specific options.# Note -- this is a base class for those who don't want the# automatic handling of errors type 302 (relocated) and 401# (authorization needed).ftpcache={}classURLopener:# Constructordef__init__(self,proxies=None):ifproxiesisNone:proxies=getproxies()self.proxies=proxiesserver_version="Python-urllib/%s"%__version__self.addheaders=[('User-agent',server_version)]self.tempcache=None# Undocumented feature: if you assign {} to tempcache,# it is used to cache files retrieved with# self.retrieve(). This is not enabled by default# since it does not work for changing documents (and I# haven't got the logic to check expiration headers# yet).self.ftpcache=ftpcache# Undocumented feature: you can use a different# ftp cache by assigning to the .ftpcache member;# in case you want logically independent URL openersdef__del__(self):self.close()defclose(self):self.cleanup()defcleanup(self):importosifself.tempcache:forurlinself.tempcache.keys():try:os.unlink(self.tempcache[url][0])exceptos.error:passdelself.tempcache[url]# Add a header to be used by the HTTP interface only# e.g. u.addheader('Accept', 'sound/basic')defaddheader(self,*args):self.addheaders.append(args)# External interface# Use URLopener().open(file) instead of open(file, 'r')defopen(self,fullurl):fullurl=unwrap(fullurl)type,url=splittype(fullurl)ifnottype:type='file'ifself.proxies.has_key(type):proxy=self.proxies[type]type,proxy=splittype(proxy)host,selector=splithost(proxy)url=(host,fullurl)# Signal special case to open_*()name='open_'+typeif'-'inname:importregsubname=regsub.gsub('-','_',name)ifnothasattr(self,name):returnself.open_unknown(fullurl)try:returngetattr(self,name)(url)exceptsocket.error,msg:raiseIOError,('socket error',msg)# Overridable interface to open unknown URL typedefopen_unknown(self,fullurl):type,url=splittype(fullurl)raiseIOError,('url error','unknown url type',type)# External interface# retrieve(url) returns (filename, None) for a local object# or (tempfilename, headers) for a remote objectdefretrieve(self,url,filename=None):ifself.tempcacheandself.tempcache.has_key(url):returnself.tempcache[url]url1=unwrap(url)ifself.tempcacheandself.tempcache.has_key(url1):self.tempcache[url]=self.tempcache[url1]returnself.tempcache[url1]type,url1=splittype(url1)ifnotfilenameand(nottypeortype=='file'):try:fp=self.open_local_file(url1)delfpreturnurl2pathname(splithost(url1)[1]),NoneexceptIOError,msg:passfp=self.open(url)headers=fp.info()ifnotfilename:importtempfilefilename=tempfile.mktemp()result=filename,headersifself.tempcacheisnotNone:self.tempcache[url]=resulttfp=open(filename,'w')bs=1024*8block=fp.read(bs)whileblock:tfp.write(block)block=fp.read(bs)delfpdeltfpreturnresult# Each method named open_<type> knows how to open that type of URL# Use HTTP protocoldefopen_http(self,url):importhttplibiftype(url)istype(""):host,selector=splithost(url)else:host,selector=urlprint"proxy via http:",host,selectorifnothost:raiseIOError,('http error','no host given')i=string.find(host,'@')ifi>=0:user_passwd,host=host[:i],host[i+1:]else:user_passwd=Noneifuser_passwd:importbase64auth=string.strip(base64.encodestring(user_passwd))else:auth=Noneh=httplib.HTTP(host)h.putrequest('GET',selector)ifauth:h.putheader('Authorization: Basic %s'%auth)forargsinself.addheaders:apply(h.putheader,args)h.endheaders()errcode,errmsg,headers=h.getreply()fp=h.getfile()iferrcode==200:returnaddinfo(fp,headers)else:returnself.http_error(url,fp,errcode,errmsg,headers)# Handle http errors.# Derived class can override this, or provide specific handlers# named http_error_DDD where DDD is the 3-digit error codedefhttp_error(self,url,fp,errcode,errmsg,headers):# First check if there's a specific handler for this errorname='http_error_%d'%errcodeifhasattr(self,name):method=getattr(self,name)result=method(url,fp,errcode,errmsg,headers)ifresult:returnresultreturnself.http_error_default(url,fp,errcode,errmsg,headers)# Default http error handler: close the connection and raises IOErrordefhttp_error_default(self,url,fp,errcode,errmsg,headers):void=fp.read()fp.close()raiseIOError,('http error',errcode,errmsg,headers)# Use Gopher protocoldefopen_gopher(self,url):importgopherlibhost,selector=splithost(url)ifnothost:raiseIOError,('gopher error','no host given')type,selector=splitgophertype(selector)selector,query=splitquery(selector)selector=unquote(selector)ifquery:query=unquote(query)fp=gopherlib.send_query(selector,query,host)else:fp=gopherlib.send_selector(selector,host)returnaddinfo(fp,noheaders())# Use local file or FTP depending on form of URLdefopen_file(self,url):ifurl[:2]=='//':returnself.open_ftp(url)else:returnself.open_local_file(url)# Use local filedefopen_local_file(self,url):host,file=splithost(url)ifnothost:returnaddinfo(open(url2pathname(file),'r'),noheaders())host,port=splitport(host)ifnotportandsocket.gethostbyname(host)in(localhost(),thishost()):file=unquote(file)returnaddinfo(open(url2pathname(file),'r'),noheaders())raiseIOError,('local file error','not on local host')# Use FTP protocoldefopen_ftp(self,url):host,path=splithost(url)ifnothost:raiseIOError,('ftp error','no host given')host,port=splitport(host)user,host=splituser(host)ifuser:user,passwd=splitpasswd(user)else:passwd=Nonehost=socket.gethostbyname(host)ifnotport:importftplibport=ftplib.FTP_PORTpath,attrs=splitattr(path)dirs=string.splitfields(path,'/')dirs,file=dirs[:-1],dirs[-1]ifdirsandnotdirs[0]:dirs=dirs[1:]key=(user,host,port,string.joinfields(dirs,'/'))try:ifnotself.ftpcache.has_key(key):self.ftpcache[key]= \
ftpwrapper(user,passwd,host,port,dirs)ifnotfile:type='D'else:type='I'forattrinattrs:attr,value=splitvalue(attr)ifstring.lower(attr)=='type'and \
valuein('a','A','i','I','d','D'):type=string.upper(value)returnaddinfo(self.ftpcache[key].retrfile(file,type),noheaders())exceptftperrors(),msg:raiseIOError,('ftp error',msg)# Derived class with handlers for errors we can handle (perhaps)classFancyURLopener(URLopener):def__init__(self,*args):apply(URLopener.__init__,(self,)+args)self.auth_cache={}# Default error handling -- don't raise an exceptiondefhttp_error_default(self,url,fp,errcode,errmsg,headers):returnaddinfo(fp,headers)# Error 302 -- relocateddefhttp_error_302(self,url,fp,errcode,errmsg,headers):# XXX The server can force infinite recursion here!ifheaders.has_key('location'):newurl=headers['location']elifheaders.has_key('uri'):newurl=headers['uri']else:returnvoid=fp.read()fp.close()returnself.open(newurl)# Error 401 -- authentication required# See this URL for a description of the basic authentication scheme:# http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txtdefhttp_error_401(self,url,fp,errcode,errmsg,headers):ifheaders.has_key('www-authenticate'):stuff=headers['www-authenticate']p=regex.compile('[ \t]*\([^ \t]+\)[ \t]+realm="\([^"]*\)"')ifp.match(stuff)>=0:scheme,realm=p.group(1,2)ifstring.lower(scheme)=='basic':returnself.retry_http_basic_auth(url,realm)defretry_http_basic_auth(self,url,realm):host,selector=splithost(url)i=string.find(host,'@')+1host=host[i:]user,passwd=self.get_user_passwd(host,realm,i)ifnot(userorpasswd):returnNonehost=user+':'+passwd+'@'+hostnewurl='//'+host+selectorreturnself.open_http(newurl)defget_user_passwd(self,host,realm,clear_cache=0):key=realm+'@'+string.lower(host)ifself.auth_cache.has_key(key):ifclear_cache:delself.auth_cache[key]else:returnself.auth_cache[key]user,passwd=self.prompt_user_passwd(host,realm)ifuserorpasswd:self.auth_cache[key]=(user,passwd)returnuser,passwddefprompt_user_passwd(self,host,realm):# Override this in a GUI environment!try:user=raw_input("Enter username for %s at %s: "%(realm,host))self.echo_off()try:passwd=raw_input("Enter password for %s in %s at %s: "%(user,realm,host))finally:self.echo_on()returnuser,passwdexceptKeyboardInterrupt:returnNone,Nonedefecho_off(self):importosos.system("stty -echo")defecho_on(self):importosprintos.system("stty echo")# Utility functions# Return the IP address of the magic hostname 'localhost'_localhost=Nonedeflocalhost():global_localhostifnot_localhost:_localhost=socket.gethostbyname('localhost')return_localhost# Return the IP address of the current host_thishost=Nonedefthishost():global_thishostifnot_thishost:_thishost=socket.gethostbyname(socket.gethostname())return_thishost# Return the set of errors raised by the FTP class_ftperrors=Nonedefftperrors():global_ftperrorsifnot_ftperrors:importftplib_ftperrors=(ftplib.error_reply,ftplib.error_temp,ftplib.error_perm,ftplib.error_proto)return_ftperrors# Return an empty mimetools.Message object_noheaders=Nonedefnoheaders():global_noheadersifnot_noheaders:importmimetoolsimportStringIO_noheaders=mimetools.Message(StringIO.StringIO(),0)_noheaders.fp.close()# Recycle file descriptorreturn_noheaders# Utility classes# Class used by open_ftp() for cache of open FTP connectionsclassftpwrapper:def__init__(self,user,passwd,host,port,dirs):self.user=unquote(useror'')self.passwd=unquote(passwdor'')self.host=hostself.port=portself.dirs=[]fordirindirs:self.dirs.append(unquote(dir))self.init()definit(self):importftplibself.ftp=ftplib.FTP()self.ftp.connect(self.host,self.port)self.ftp.login(self.user,self.passwd)fordirinself.dirs:self.ftp.cwd(dir)defretrfile(self,file,type):importftplibiftypein('d','D'):cmd='TYPE A';isdir=1else:cmd='TYPE '+type;isdir=0try:self.ftp.voidcmd(cmd)exceptftplib.all_errors:self.init()self.ftp.voidcmd(cmd)conn=Noneiffileandnotisdir:try:cmd='RETR '+fileconn=self.ftp.transfercmd(cmd)exceptftplib.error_perm,reason:ifreason[:3]!='550':raiseIOError,('ftp error',reason)ifnotconn:# Try a directory listingiffile:cmd='LIST '+fileelse:cmd='LIST'conn=self.ftp.transfercmd(cmd)returnaddclosehook(conn.makefile('rb'),self.ftp.voidresp)# Base class for addinfo and addclosehookclassaddbase:def__init__(self,fp):self.fp=fpself.read=self.fp.readself.readline=self.fp.readlineself.readlines=self.fp.readlinesself.fileno=self.fp.filenodef__repr__(self):return'<%s at %s whose fp = %s>'%(self.__class__.__name__,`id(self)`,`self.fp`)defclose(self):self.read=Noneself.readline=Noneself.readlines=Noneself.fileno=Noneifself.fp:self.fp.close()self.fp=None# Class to add a close hook to an open fileclassaddclosehook(addbase):def__init__(self,fp,closehook,*hookargs):addbase.__init__(self,fp)self.closehook=closehookself.hookargs=hookargsdefclose(self):ifself.closehook:apply(self.closehook,self.hookargs)self.closehook=Noneself.hookargs=Noneaddbase.close(self)# class to add an info() method to an open fileclassaddinfo(addbase):def__init__(self,fp,headers):addbase.__init__(self,fp)self.headers=headersdefinfo(self):returnself.headers# Utility to combine a URL with a base URL to form a new URLdefbasejoin(base,url):type,path=splittype(url)iftype:# if url is complete (i.e., it contains a type), return itreturnurlhost,path=splithost(path)type,basepath=splittype(base)# inherit type from baseifhost:# if url contains host, just inherit typeiftype:returntype+'://'+host+pathelse:# no type inherited, so url must have started with //# just return itreturnurlhost,basepath=splithost(basepath)# inherit hostbasepath,basetag=splittag(basepath)# remove extraneuous cruftbasepath,basequery=splitquery(basepath)# idemifpath[:1]!='/':# non-absolute path nameifpath[:1]in('#','?'):# path is just a tag or query, attach to basepathi=len(basepath)else:# else replace last componenti=string.rfind(basepath,'/')ifi<0:# basepath not absoluteifhost:# host present, make absolutebasepath='/'else:# else keep non-absolutebasepath=''else:# remove last file componentbasepath=basepath[:i+1]path=basepath+pathiftypeandhost:returntype+'://'+host+patheliftype:returntype+':'+pathelifhost:return'//'+host+path# don't know what this meanselse:returnpath# Utilities to parse URLs (most of these return None for missing parts):# unwrap('<URL:type://host/path>') --> 'type://host/path'# splittype('type:opaquestring') --> 'type', 'opaquestring'# splithost('//host[:port]/path') --> 'host[:port]', '/path'# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'# splitpasswd('user:passwd') -> 'user', 'passwd'# splitport('host:port') --> 'host', 'port'# splitquery('/path?query') --> '/path', 'query'# splittag('/path#tag') --> '/path', 'tag'# splitattr('/path;attr1=value1;attr2=value2;...') -># '/path', ['attr1=value1', 'attr2=value2', ...]# splitvalue('attr=value') --> 'attr', 'value'# splitgophertype('/Xselector') --> 'X', 'selector'# unquote('abc%20def') -> 'abc def'# quote('abc def') -> 'abc%20def')defunwrap(url):url=string.strip(url)ifurl[:1]=='<'andurl[-1:]=='>':url=string.strip(url[1:-1])ifurl[:4]=='URL:':url=string.strip(url[4:])returnurl_typeprog=regex.compile('^\([^/:]+\):\(.*\)$')defsplittype(url):if_typeprog.match(url)>=0:return_typeprog.group(1,2)returnNone,url_hostprog=regex.compile('^//\([^/]+\)\(.*\)$')defsplithost(url):if_hostprog.match(url)>=0:return_hostprog.group(1,2)returnNone,url_userprog=regex.compile('^\([^@]*\)@\(.*\)$')defsplituser(host):if_userprog.match(host)>=0:return_userprog.group(1,2)returnNone,host_passwdprog=regex.compile('^\([^:]*\):\(.*\)$')defsplitpasswd(user):if_passwdprog.match(user)>=0:return_passwdprog.group(1,2)returnuser,None_portprog=regex.compile('^\(.*\):\([0-9]+\)$')defsplitport(host):if_portprog.match(host)>=0:return_portprog.group(1,2)returnhost,None# Split host and port, returning numeric port.# Return given default port if no ':' found; defaults to -1.# Return numerical port if a valid number are found after ':'.# Return None if ':' but not a valid number._nportprog=regex.compile('^\(.*\):\(.*\)$')defsplitnport(host,defport=-1):if_nportprog.match(host)>=0:host,port=_nportprog.group(1,2)try:ifnotport:raisestring.atoi_error,"no digits"nport=string.atoi(port)exceptstring.atoi_error:nport=Nonereturnhost,nportreturnhost,defport_queryprog=regex.compile('^\(.*\)\?\([^?]*\)$')defsplitquery(url):if_queryprog.match(url)>=0:return_queryprog.group(1,2)returnurl,None_tagprog=regex.compile('^\(.*\)#\([^#]*\)$')defsplittag(url):if_tagprog.match(url)>=0:return_tagprog.group(1,2)returnurl,Nonedefsplitattr(url):words=string.splitfields(url,';')returnwords[0],words[1:]_valueprog=regex.compile('^\([^=]*\)=\(.*\)$')defsplitvalue(attr):if_valueprog.match(attr)>=0:return_valueprog.group(1,2)returnattr,Nonedefsplitgophertype(selector):ifselector[:1]=='/'andselector[1:2]:returnselector[1],selector[2:]returnNone,selector_quoteprog=regex.compile('%[0-9a-fA-F][0-9a-fA-F]')defunquote(s):i=0n=len(s)res=''while0<=i<n:j=_quoteprog.search(s,i)ifj<0:res=res+s[i:]breakres=res+(s[i:j]+chr(string.atoi(s[j+1:j+3],16)))i=j+3returnresalways_safe=string.letters+string.digits+'_,.-'defquote(s,safe='/'):safe=always_safe+saferes=''forcins:ifcinsafe:res=res+celse:res=res+'%%%02x'%ord(c)returnres# Proxy handlingdefgetproxies():"""Return a dictionary of protocol scheme -> proxy server URL mappings. Scan the environment for variables named <scheme>_proxy; this seems to be the standard convention. If you need a different way, you can pass a proxies dictionary to the [Fancy]URLopener constructor. """proxies={}forname,valueinos.environ.items():ifvalueandname[-6:]=='_proxy':proxies[name[:-6]]=valuereturnproxies# Test and time quote() and unquote()deftest1():importtimes=''foriinrange(256):s=s+chr(i)s=s*4t0=time.time()qs=quote(s)uqs=unquote(qs)t1=time.time()ifuqs!=s:print'Wrong!'print`s`print`qs`print`uqs`printround(t1-t0,3),'sec'# Test programdeftest():importsysimportregsubargs=sys.argv[1:]ifnotargs:args=['/etc/passwd','file:/etc/passwd','file://localhost/etc/passwd','ftp://ftp.cwi.nl/etc/passwd','gopher://gopher.cwi.nl/11/','http://www.cwi.nl/index.html',]try:forurlinargs:print'-'*10,url,'-'*10fn,h=urlretrieve(url)printfn,hifh:print'======'forkinh.keys():printk+':',h[k]print'======'fp=open(fn,'r')data=fp.read()delfpprintregsub.gsub('\r','',data)fn,h=None,Noneprint'-'*40finally:urlcleanup()# Run test program when run as a scriptif__name__=='__main__':## test1()test()