##!interval=7
##!contact=blaz.zupan@fri.uni-lj.si

"""Refresh the pickled GEO data set (GDS) summary kept on the file server.

The script maintains a pickled ``(gds_info, excluded)`` tuple in the
``GEO`` server-files domain:

* ``gds_info`` maps a GDS id to the ``info`` dictionary of a data set whose
  ``taxid`` is in ``obiTaxonomy.common_taxids()``;
* ``excluded`` maps a GDS id to the ``taxid`` of a data set that was left
  out because its organism is not in that list.

It lists the NCBI FTP directory, fetches every data set that is either not
yet known or listed as modified after the stored info file, and uploads the
refreshed pickle.

NOTE(review): the two ``##!`` lines at the top look like directives read by
an external update scheduler -- keep them first and unchanged.
"""

from Orange.bio import obiTaxonomy, obiGEO
import cPickle
import re
import ftplib
import time

# Presumably the source of ``sf_server`` and ``sf_local`` (they are not
# defined anywhere in this file).
from common import *

# Imported after the star import so that these names are guaranteed to be
# the standard-library ones regardless of what ``common`` exports.
import os
from datetime import datetime, timedelta

DOMAIN = "GEO"
GDS_INFO = "gds_info.pickled"
TITLE = "Gene Expression Omnibus data sets information"
TAGS = ["Gene Expression Omnibus", "data sets", "GEO", "GDS"]

FTP_NCBI = "ftp.ncbi.nih.gov"
NCBI_DIR = "pub/geo/DATA/SOFT/GDS"

# Set to True to discard the stored information and rebuild it from scratch.
force_update = False

# check if the DOMAIN/files are already on the server, else, create
if DOMAIN not in sf_server.listdomains():
    # DOMAIN does not exist on the server, create it
    sf_server.create_domain(DOMAIN)

localfile = sf_local.localpath(DOMAIN, GDS_INFO)


def _create_path_for_file(target):  # KEGG uses this!
    """Create the directory that will hold *target*.

    An ``OSError`` from ``os.makedirs`` (for instance because the directory
    already exists) is deliberately ignored.
    """
    try:
        os.makedirs(os.path.dirname(target))
    except OSError:
        pass


def _save_info(info_map, excluded_map):
    """Pickle ``(info_map, excluded_map)`` into the local info file."""
    with open(localfile, "wb") as stream:
        # Third argument kept exactly as in the original calls so that the
        # on-disk pickle format does not change.
        cPickle.dump((info_map, excluded_map), stream, True)


def modified(line):
    """Return the modification time parsed from one FTP directory line.

    Fields 5-7 of the whitespace-split line are read either as
    ``Mon DD HH:MM`` (no year given) or as ``Mon DD YYYY``.  If neither
    format matches, a warning is printed and the current time is returned,
    which makes the entry count as freshly modified.
    """
    stamp = line.split()[5:8]
    now = datetime.today()
    try:
        parsed = datetime.strptime(" ".join(stamp + [str(now.year)]),
                                   "%b %d %H:%M %Y")
        # A year-less stamp that lands in the future (e.g. a December entry
        # read in January) belongs to the previous year.  One day of slack
        # absorbs clock differences between the server and this host.
        if parsed > now + timedelta(days=1):
            parsed = parsed.replace(year=parsed.year - 1)
        return parsed
    except ValueError:
        pass
    try:
        return datetime.strptime(" ".join(stamp), "%b %d %Y")
    except ValueError:
        print("Warning: could not retrieve modified date for\n%s" % line)
    return now


# Not used below; kept because it is a call into sf_local.
path = sf_local.localpath(DOMAIN)

if GDS_INFO in sf_server.listfiles(DOMAIN):
    print("Updating info file from server ...")
    sf_local.update(DOMAIN, GDS_INFO)
    file_info = sf_local.info(DOMAIN, GDS_INFO)
    gds_info_datetime = datetime.strptime(file_info["datetime"],
                                          "%Y-%m-%d %H:%M:%S.%f")
else:
    print("Creating a local path...")
    _create_path_for_file(localfile)
    _save_info({}, {})
    sf_server.upload(DOMAIN, GDS_INFO, localfile, TITLE, TAGS)
    sf_server.protect(DOMAIN, GDS_INFO, "0")
    # Epoch start: every listed data set counts as newer than the info file.
    gds_info_datetime = datetime.fromtimestamp(0)

# read the information from the local file
# excluded should be a dictionary (GEO_ID, TAX_ID)
# NOTE(security): unpickling executes arbitrary code; this is only
# acceptable because the file comes from our own file server.
with open(localfile, "rb") as stream:
    gds_info, excluded = cPickle.load(stream)

# if need to refresh the data base
if force_update:
    gds_info, excluded = ({}, {})

# Fetched once; the original re-evaluated the call for every element.
common_taxids = obiTaxonomy.common_taxids()

# list of common organisms may have changed, rescan excluded list
excluded = dict((gds_id, taxid) for gds_id, taxid in excluded.items()
                if taxid not in common_taxids)
excluded.update((gds_id, gds["taxid"]) for gds_id, gds in gds_info.items()
                if gds["taxid"] not in common_taxids)
gds_info = dict((gds_id, gds) for gds_id, gds in gds_info.items()
                if gds["taxid"] in common_taxids)

# get the list of GDS files from NCBI directory
print("Retrieving ftp directory ...")
dirlist = []
ftp = ftplib.FTP(FTP_NCBI)
try:
    ftp.login()
    ftp.cwd(NCBI_DIR)
    ftp.dir(dirlist.append)
finally:
    # The connection is only needed for the listing.
    ftp.close()

# At least one digit is required so that a bare "GDS" is never taken for a
# data set id.
gds_pattern = re.compile(r"GDS[0-9]+")
listed = []
for entry in dirlist:
    match = gds_pattern.search(entry)
    if match:
        listed.append((match.group(0), modified(entry)))

# Fetch data sets that are new to us or were modified after the info file.
gds_names = [name for name, time_m in listed
             if (name not in gds_info and name not in excluded)
             or time_m > gds_info_datetime]
skipped = []

if gds_names:
    total = len(gds_names)
    for count, gds_name in enumerate(gds_names):
        print("%3d of %3d -- Adding %s ..." % (count + 1, total, gds_name))
        try:
            time.sleep(1)  # pause between consecutive downloads
            gds = obiGEO.GDS(gds_name)
            if gds.info["taxid"] not in common_taxids:
                excluded[gds_name] = gds.info["taxid"]
                print("... excluded (%s)." % gds.info["sample_organism"])
            else:
                gds_info.update({gds_name: gds.info})
                # Checkpoint so an interrupted run keeps what was fetched.
                _save_info(gds_info, excluded)
                print("... added.")
        except Exception as ex:
            # Best effort: one failing data set must not abort the update.
            print("... skipped (error): %s" % str(ex))
            skipped.append(gds_name)

    # Save once more: data sets excluded after the last checkpoint would
    # otherwise be missing from the uploaded file.
    _save_info(gds_info, excluded)

    print("Updating %s:%s on the server ..." % (DOMAIN, GDS_INFO))
    sf_server.upload(DOMAIN, GDS_INFO, localfile, TITLE, TAGS)
    sf_server.protect(DOMAIN, GDS_INFO, "0")
else:
    print("No update required.")

print("")
print("GDS data sets: %d" % len(gds_info))
print("Organisms:")
organism_counts = {}
for gds in gds_info.values():
    organism = gds["sample_organism"]
    organism_counts[organism] = organism_counts.get(organism, 0) + 1
# Sorted so the report is stable from run to run.
for organism in sorted(organism_counts):
    print(" %s (%d)" % (organism, organism_counts[organism]))