1# Copyright 1999 by Jeffrey Chang. All rights reserved. 2# This code is part of the Biopython distribution and governed by its 3# license. Please see the LICENSE file that should have been included 4# as part of this package. 5 6# Patched by Brad Chapman. 7# Chris Wroe added modifications for work in myGrid 8 9"""Code to invoke the NCBI BLAST server over the internet. 10 11This module provides code to work with the WWW version of BLAST 12provided by the NCBI. https://blast.ncbi.nlm.nih.gov/ 13""" 14 15from__future__importprint_function 16 17fromBio._py3kimportStringIO 18fromBio._py3kimport_as_string,_as_bytes 19fromBio._py3kimporturlopenas_urlopen 20fromBio._py3kimporturlencodeas_urlencode 21fromBio._py3kimportRequestas_Request 22 23 24NCBI_BLAST_URL="https://blast.ncbi.nlm.nih.gov/Blast.cgi" 25 26

42"""Do a BLAST search using the QBLAST server at NCBI or a cloud service 43 provider. 44 45 Supports all parameters of the qblast API for Put and Get. 46 47 Please note that BLAST on the cloud supports the NCBI-BLAST Common 48 URL API (http://ncbi.github.io/blast-cloud/dev/api.html). To 49 use this feature, please set url_base to 50 'http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi' and 51 format_object='Alignment'. For more details, please see 52 https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast 53 54 Some useful parameters: 55 56 - program blastn, blastp, blastx, tblastn, or tblastx (lower case) 57 - database Which database to search against (e.g. "nr"). 58 - sequence The sequence to search. 59 - ncbi_gi TRUE/FALSE whether to give 'gi' identifier. 60 - descriptions Number of descriptions to show. Def 500. 61 - alignments Number of alignments to show. Def 500. 62 - expect An expect value cutoff. Def 10.0. 63 - matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45). 64 - filter "none" turns off filtering. Default no filtering 65 - format_type "HTML", "Text", "ASN.1", or "XML". Def. "XML". 66 - entrez_query Entrez query to limit Blast search 67 - hitlist_size Number of hits to return. Default 50 68 - megablast TRUE/FALSE whether to use MEga BLAST algorithm (blastn only) 69 - service plain, psi, phi, rpsblast, megablast (lower case) 70 71 This function does no checking of the validity of the parameters 72 and passes the values to the server as is. More help is available at: 73 http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html 74 75 """ 76importtime 77 78assertprogramin['blastn','blastp','blastx','tblastn','tblastx'] 79 80# Format the "Put" command, which sends search requests to qblast. 81# Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007 82# Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010 83# To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified 84# (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi")) 85parameters=[ 86('AUTO_FORMAT',auto_format), 87('COMPOSITION_BASED_STATISTICS',composition_based_statistics), 88('DATABASE',database), 89('DB_GENETIC_CODE',db_genetic_code), 90('ENDPOINTS',endpoints), 91('ENTREZ_QUERY',entrez_query), 92('EXPECT',expect), 93('FILTER',filter), 94('GAPCOSTS',gapcosts), 95('GENETIC_CODE',genetic_code), 96('HITLIST_SIZE',hitlist_size), 97('I_THRESH',i_thresh), 98('LAYOUT',layout), 99('LCASE_MASK',lcase_mask),100('MEGABLAST',megablast),101('MATRIX_NAME',matrix_name),102('NUCL_PENALTY',nucl_penalty),103('NUCL_REWARD',nucl_reward),104('OTHER_ADVANCED',other_advanced),105('PERC_IDENT',perc_ident),106('PHI_PATTERN',phi_pattern),107('PROGRAM',program),108# ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?109('QUERY',sequence),110('QUERY_FILE',query_file),111('QUERY_BELIEVE_DEFLINE',query_believe_defline),112('QUERY_FROM',query_from),113('QUERY_TO',query_to),114# ('RESULTS_FILE',...), - Can we use this parameter?115('SEARCHSP_EFF',searchsp_eff),116('SERVICE',service),117('THRESHOLD',threshold),118('UNGAPPED_ALIGNMENT',ungapped_alignment),119('WORD_SIZE',word_size),120('CMD','Put'),121]122query=[xforxinparametersifx[1]isnotNone]123message=_as_bytes(_urlencode(query))124125# Send off the initial query to qblast.126# Note the NCBI do not currently impose a rate limit here, other127# than the request not to make say 50 queries at once using multiple128# threads.129request=_Request(url_base,130message,131{"User-Agent":"BiopythonClient"})132handle=_urlopen(request)133134# Format the "Get" command, which gets the formatted results from qblast135# Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007136rid,rtoe=_parse_qblast_ref_page(handle)137parameters=[138('ALIGNMENTS',alignments),139('ALIGNMENT_VIEW',alignment_view),140('DESCRIPTIONS',descriptions),141('ENTREZ_LINKS_NEW_WINDOW',entrez_links_new_window),142('EXPECT_LOW',expect_low),143('EXPECT_HIGH',expect_high),144('FORMAT_ENTREZ_QUERY',format_entrez_query),145('FORMAT_OBJECT',format_object),146('FORMAT_TYPE',format_type),147('NCBI_GI',ncbi_gi),148('RID',rid),149('RESULTS_FILE',results_file),150('SERVICE',service),151('SHOW_OVERVIEW',show_overview),152('CMD','Get'),153]154query=[xforxinparametersifx[1]isnotNone]155message=_as_bytes(_urlencode(query))156157# Poll NCBI until the results are ready. Use a backoff delay from 2 - 120 second wait158delay=2.0159previous=time.time()160whileTrue:161current=time.time()162wait=previous+delay-current163ifwait>0:164time.sleep(wait)165previous=current+wait166else:167previous=current168ifdelay+.5*delay<=120:169delay+=.5*delay170else:171delay=120172173request=_Request(url_base,174message,175{"User-Agent":"BiopythonClient"})176handle=_urlopen(request)177results=_as_string(handle.read())178179# Can see an "\n\n" page while results are in progress,180# if so just wait a bit longer...181ifresults=="\n\n":182continue183# XML results don't have the Status tag when finished184if"Status="notinresults:185break186i=results.index("Status=")187j=results.index("\n",i)188status=results[i+len("Status="):j].strip()189ifstatus.upper()=="READY":190break191192returnStringIO(results)

196"""Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).197198 The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably199 'Request Time of Execution' and RID would be 'Request Identifier'.200 """201s=_as_string(handle.read())202i=s.find("RID =")203ifi==-1:204rid=None205else:206j=s.find("\n",i)207rid=s[i+len("RID ="):j].strip()208209i=s.find("RTOE =")210ifi==-1:211rtoe=None212else:213j=s.find("\n",i)214rtoe=s[i+len("RTOE ="):j].strip()215216ifnotridandnotrtoe:217# Can we reliably extract the error message from the HTML page?218# e.g. "Message ID#24 Error: Failed to read the Blast query:219# Nucleotide FASTA provided for protein sequence"220# or "Message ID#32 Error: Query contains no data: Query221# contains no sequence data"222#223# This used to occur inside a <div class="error msInf"> entry:224i=s.find('<div class="error msInf">')225ifi!=-1:226msg=s[i+len('<div class="error msInf">'):].strip()227msg=msg.split("</div>",1)[0].split("\n",1)[0].strip()228ifmsg:229raiseValueError("Error message from NCBI: %s"%msg)230# In spring 2010 the markup was like this:231i=s.find('<p class="error">')232ifi!=-1:233msg=s[i+len('<p class="error">'):].strip()234msg=msg.split("</p>",1)[0].split("\n",1)[0].strip()235ifmsg:236raiseValueError("Error message from NCBI: %s"%msg)237# Generic search based on the way the error messages start:238i=s.find('Message ID#')239ifi!=-1:240# Break the message at the first HTML tag241msg=s[i:].split("<",1)[0].split("\n",1)[0].strip()242raiseValueError("Error message from NCBI: %s"%msg)243# We didn't recognise the error layout :(244# print s245raiseValueError("No RID and no RTOE found in the 'please wait' page, "246"there was probably an error in your request but we "247"could not extract a helpful error message.")248elifnotrid:249# Can this happen?250raiseValueError("No RID found in the 'please wait' page."251" (although RTOE = %s)"%repr(rtoe))252elifnotrtoe:253# Can this happen?254raiseValueError("No RTOE found in the 'please wait' page."255" (although RID = %s)"%repr(rid))256257try:258returnrid,int(rtoe)259exceptValueError:260raiseValueError("A non-integer RTOE found in "261"the 'please wait' page, %s"%repr(rtoe))