htrc_features.feature_reader module

from __future__ import unicode_literals
import sys
PY3 = (sys.version_info[0] >= 3)
import logging
import pandas as pd
import numpy as np
import pymarc
from six import iteritems, StringIO, BytesIO
try:
    import ujson as json
except ImportError:
    import json
import requests

if PY3:
    from urllib.request import urlopen as _urlopen
    from urllib.parse import urlparse as parse_url
    from urllib.error import HTTPError
else:
    from urlparse import urlparse as parse_url
    from urllib2 import urlopen as _urlopen
    from urllib2 import HTTPError

try:
    import bz2file as bz2
except ImportError:
    import bz2
    if not PY3:
        logging.warning("Loading volumes from a URL will not work in "
                        "Python 2 unless you install bz2file")

# UTILS
SECREF = ['header', 'body', 'footer']

Module variables

var PY3

var SECREF

Functions

def group_linechars(df, section='all', place='all'):
    # Set up grouping
    groups = ['page']
    if section in SECREF + ['all']:
        groups.append('section')
    if place in ['begin', 'end', 'all']:
        groups.append('place')
    groups.append('character')

    # Set up slicing
    slices = [slice(None)]
    if section in ['all', 'group']:
        slices.append(slice(None))
    elif section in SECREF:
        slices.append([section])
    if place in ['begin', 'end']:
        slices.append([place])
    elif place in ['group', 'all']:
        # It's hard to imagine a use for place='group', but adding for
        # completeness
        slices.append(slice(None))

    if slices != [slice(None)] * 3:
        df = df.loc[tuple(slices), ]

    if groups == ['page', 'section', 'place', 'character']:
        return df
    else:
        return df.groupby(groups).sum()[['count']]
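A minimal usage sketch (the file path below is an assumption, not part of the library): Volume.line_chars() builds the page/section/place/char DataFrame that this function folds, so calling group_linechars directly is mainly useful for re-folding a frame you already hold.

vol = FeatureReader('data/sample_volume.json.bz2').first()  # hypothetical path
df = vol.line_chars(section='all', place='all')  # full, unfolded index
# Slice down to begin-of-line characters in the body section:
body_begins = group_linechars(df, section='body', place='begin')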

pages[bool]: If true, keep pages. If false, combine all pages.
section[string]: 'header', 'body', 'footer' will only return those
sections. 'all' will return all info, unfolded. 'group' combines
all sections info.
case[bool]: If true, return case-sensitive token counts.
pos[bool]: If true, return tokens faceted by part-of-speech.
page_freq[bool]: If true, will simply count whether or not a token is
on a page. Defaults to false.

def group_tokenlist(in_df, pages=True, section='all', case=True, pos=True,
                    page_freq=False):
    ''' Return a token count dataframe with requested folding.

    pages[bool]: If true, keep pages. If false, combine all pages.
    section[string]: 'header', 'body', 'footer' will only return those
        sections. 'all' will return all info, unfolded. 'group' combines
        all sections' info.
    case[bool]: If true, return case-sensitive token counts.
    pos[bool]: If true, return tokens faceted by part-of-speech.
    page_freq[bool]: If true, will simply count whether or not a token is
        on a page. Defaults to false.
    '''
    groups = []
    if pages:
        groups.append('page')
    if section in ['all'] + SECREF:
        groups.append('section')
    groups.append('token' if case else 'lowercase')
    if pos:
        groups.append('pos')

    if in_df.empty:
        return pd.DataFrame([], columns=groups)

    if section in ['all', 'group']:
        df = in_df
    elif section in SECREF:
        idx = pd.IndexSlice
        try:
            df = in_df.loc[idx[:, section, :, :], ]
        except KeyError:
            logging.debug("Section {} not available".format(section))
            df = pd.DataFrame([], columns=groups + ['count'])\
                   .set_index(groups)
            return df
    else:
        logging.error("Invalid section argument: {}".format(section))
        return

    # Add lowercase column. Previously, this was saved internally. However,
    # DataFrame.str.lower() is reasonably fast and the need to call it
    # repeatedly is low, so it is no longer saved.
    # This makes the internal representation more predictable, hopefully
    # avoiding unexpected bugs.
    if not case:
        # Replace our df reference with a copy.
        df = df.copy()
        logging.debug('Adding lowercase column')
        df.insert(len(df.columns), 'lowercase',
                  df.index.get_level_values('token').str.lower())

    # Check if we need to group anything
    if groups == ['page', 'section', 'token', 'pos']:
        if page_freq:
            pd.options.mode.chained_assignment = None
            df['count'] = 1
            pd.options.mode.chained_assignment = 'warn'
        return df
    else:
        if not page_freq:
            return df.reset_index().groupby(groups).sum()[['count']]
        elif page_freq and 'page' in groups:
            df = df.reset_index().groupby(groups).sum()[['count']]
            pd.options.mode.chained_assignment = None
            df['count'] = 1
            pd.options.mode.chained_assignment = 'warn'
            return df
        elif page_freq and 'page' not in groups:
            # We'll have to group at the page level, then group again
            def set_to_one(x):
                x['count'] = 1
                return x
            return df.reset_index().groupby(['page'] + groups)\
                     .apply(set_to_one)\
                     .groupby(groups).sum()[['count']]
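As a sketch of the folding options, using the FeatureReader class documented below (the file path is hypothetical): Volume.tokenlist() already routes through group_tokenlist, so a direct call is mainly for re-folding a token count DataFrame you already have.

vol = FeatureReader('data/sample_volume.json.bz2').first()  # hypothetical path
tl = vol.tokenlist()  # indexed by page / section / token / pos
# Fold away pages, sections, case, and POS, leaving one count per
# lowercase token (body sections only, the tokenlist default):
folded = group_tokenlist(tl, pages=False, section='group',
                         case=False, pos=False)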

Classes

class FeatureReader(object):
    DL_URL = "https://data.analytics.hathitrust.org/features/get?download-id={0}"

    def __init__(self, paths=None, compressed=True, ids=None):
        self.compressed = compressed
        if paths:
            self._online = False
            if type(paths) is list:
                self.paths = paths
            else:
                self.paths = [paths]
        else:
            self.paths = []

        if ids:
            if type(ids) is list:
                self.paths += [self.DL_URL.format(id) for id in ids]
            else:
                self.paths.append(self.DL_URL.format(ids))

        self.index = 0

    def __iter__(self):
        return self.volumes()

    def __len__(self):
        return len(self.paths)

    def __str__(self):
        return "HTRC Feature Reader with %d paths loaded" % (len(self.paths))

    def volumes(self):
        ''' Generator for returning Volume objects '''
        for path in self.paths:
            # If path is a tuple, assume that the advanced path was also given
            if type(path) == tuple:
                basic, advanced = path
                yield self._volume(basic, advanced_path=advanced,
                                   compressed=self.compressed)
            else:
                yield self._volume(path, compressed=self.compressed)

    def jsons(self):
        ''' Generator for returning decompressed, parsed json dictionaries
        for volumes. Convenience function for when the FeatureReader objects
        are not needed. '''
        for path in self.paths:
            # If path is a tuple, assume that the advanced path was also given
            if type(path) == tuple:
                basic, advanced = path
                basicjson = self._read_json(basic, compressed=self.compressed)
                advjson = self._read_json(advanced,
                                          compressed=self.compressed)
                yield (basicjson, advjson)
            else:
                yield self._read_json(path, compressed=self.compressed)

    def first(self):
        ''' Return the first volume from the Feature Reader. This is a
        convenience feature for single-volume imports or to quickly get a
        volume for testing. '''
        return next(self.volumes())

    def create_volume(self, path, **kwargs):
        return self._volume(path, **kwargs)

    def _read_json(self, path_or_url, compressed=True, advanced_path=False):
        ''' Load JSON for a path. Allows remote files in addition to local
        ones. '''
        if parse_url(path_or_url).scheme in ['http', 'https']:
            try:
                req = _urlopen(path_or_url)
                filename_or_buffer = BytesIO(req.read())
            except HTTPError:
                logging.exception("HTTP Error with id %s" % path_or_url)
                raise
            compressed = True
        else:
            filename_or_buffer = path_or_url

        try:
            if compressed:
                f = bz2.BZ2File(filename_or_buffer)
            else:
                f = open(filename_or_buffer, 'r+')
            rawjson = f.readline()
            f.close()
        except IOError:
            logging.exception("Can't read %s. Did you pass the incorrect "
                              "'compressed=' argument?", path_or_url)
            raise
        except:
            logging.exception("Can't open %s", path_or_url)
            raise

        # This is a bandaid for schema version 2.0, not over-engineered
        # since upcoming releases of the extracted features
        # dataset won't keep the basic/advanced split
        try:
            # For Python 3 compatibility, decode to a str object
            if type(rawjson) != str:
                rawjson = rawjson.decode()
            volumejson = json.loads(rawjson)
        except:
            logging.exception("Problem reading JSON for %s. One common reason"
                              " for this error is an incorrect compressed= "
                              "argument", path_or_url)
            raise
        return volumejson

    def _volume(self, path, compressed=True, advanced_path=False):
        ''' Read a path into a volume. '''
        volumejson = self._read_json(path, compressed)
        if advanced_path:
            advanced = self._read_json(advanced_path, compressed)
            advanced = advanced['features']
        else:
            advanced = False
        return Volume(volumejson, advanced=advanced)

    def _wrap_func(self, func):
        ''' Convert a volume path to a volume and run func(vol). For
        multiprocessing.
        TODO: Closures won't work, this is a useless function.
        Remove this after consideration...
        '''
        def new_func(path):
            vol = self._volume(path)
            func(vol)
        return new_func

    def __repr__(self):
        if len(self.paths) > 1:
            return "<%d path FeatureReader (%s to %s)>" % (len(self.paths),
                                                           self.paths[0],
                                                           self.paths[-1])
        elif len(self.paths) == 1:
            return "<FeatureReader for %s>" % self.paths[0]
        else:
            return "<Empty FeatureReader>"

    def __str__(self):
        return "<%d path FeatureReader>" % (len(self.paths))
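A minimal construction sketch; the file paths and the HathiTrust volume id below are placeholders, not real data:

fr = FeatureReader(['vol1.json.bz2', 'vol2.json.bz2'])  # local, bz2-compressed
fr = FeatureReader('vol1.json', compressed=False)       # one uncompressed file
fr = FeatureReader(ids=['mdp.39015012345678'])          # fetch by id via DL_URL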

def jsons(self):
    ''' Generator for returning decompressed, parsed json dictionaries for
    volumes. Convenience function for when the FeatureReader objects are
    not needed. '''
    for path in self.paths:
        # If path is a tuple, assume that the advanced path was also given
        if type(path) == tuple:
            basic, advanced = path
            basicjson = self._read_json(basic, compressed=self.compressed)
            advjson = self._read_json(advanced, compressed=self.compressed)
            yield (basicjson, advjson)
        else:
            yield self._read_json(path, compressed=self.compressed)

def volumes(self):
    ''' Generator for returning Volume objects '''
    for path in self.paths:
        # If path is a tuple, assume that the advanced path was also given
        if type(path) == tuple:
            basic, advanced = path
            yield self._volume(basic, advanced_path=advanced,
                               compressed=self.compressed)
        else:
            yield self._volume(path, compressed=self.compressed)
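Iteration is lazy: each file is only read and parsed when its Volume is yielded. A short sketch, reusing a reader like the ones above:

for vol in fr.volumes():  # equivalent to `for vol in fr:`
    print(vol.id, vol.title)
vol = fr.first()          # or just grab the first volume for testing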


class Page:
    _tokencounts = pd.DataFrame()
    _line_chars = pd.DataFrame()

    BASIC_FIELDS = [('seq', 'seq'), ('tokenCount', '_token_count'),
                    ('languages', 'languages')]
    ''' List of fields which return primitive values in the schema, as tuples
    with (camelCase, lower_with_under) mapping '''

    SECTION_FIELDS = ['lineCount', 'emptyLineCount', 'sentenceCount',
                      'capAlphaSeq']
    ''' Fields that are counted by section. '''

    def __init__(self, pageobj, volume, default_section='body'):
        self.volume = volume
        self.default_section = default_section
        self._json = pageobj

        assert(self.default_section in SECREF + ['all', 'group'])

        for key, pythonkey in self.BASIC_FIELDS:
            if key in pageobj:
                setattr(self, pythonkey, pageobj[key])

        arr = np.zeros((len(SECREF), len(self.SECTION_FIELDS)), dtype='u4')
        for i, sec in enumerate(SECREF):
            for j, stat in enumerate(self.SECTION_FIELDS):
                if stat in self._json[sec]:
                    arr[i, j] = self._json[sec][stat]
                else:
                    arr[i, j] = 0
        self._basic_stats = pd.DataFrame(arr, columns=self.SECTION_FIELDS,
                                         index=SECREF)

    def tokens(self, section='default', case=True):
        ''' Get unique tokens '''
        tokens = self.tokenlist(section=section).index\
                     .get_level_values('token').to_series()
        if case:
            return tokens.unique().tolist()
        else:
            return tokens.str.lower().unique().tolist()

    def count(self):
        return self._df['count'].astype(int).sum()

    def line_count(self, section='default'):
        return self._get_basic_stat(section, 'lineCount')

    def empty_line_count(self, section='default'):
        return self._get_basic_stat(section, 'emptyLineCount')

    def cap_alpha_seq(self, section='body'):
        ''' Return the longest length of consecutive capital letters starting
        a line on the page. Returns an integer. Only includes the body:
        header/footer information is not included. '''
        if section != 'body':
            logging.warning("cap_alpha_seq only includes counts for the body "
                            "section of pages.")
        return self._get_basic_stat('body', 'capAlphaSeq')

    def sentence_count(self, section='default'):
        return self._get_basic_stat(section, 'sentenceCount')

    def _get_basic_stat(self, section, stat):
        if stat == 'all':
            # Return all columns. Not publicized currently.
            stat = slice(None)

        if section == 'default':
            section = self.default_section

        if section in ['header', 'body', 'footer']:
            return self._basic_stats.loc[section, stat]
        elif section == 'all':
            return self._basic_stats.loc[:, stat]
        elif section == 'group':
            return self._basic_stats.loc[:, stat].sum()

    def tokenlist(self, section='default', case=True, pos=True):
        ''' Get or set the token counts DataFrame

        section[string]: Which part of the page to return. In addition to
            'header', 'body', and 'footer', 'all' will return a DataFrame
            with all the sections and 'group' will sum all sections.
            section == 'default' falls back on what the page object has
            saved.

        case[bool]: Preserve case, or fold. To save processing, it's likely
            more efficient to calculate lowercase later in the process: if
            you want information for all pages, first collect your
            information case-sensitive, then fold at the end.

        pos[bool]: Specify whether to return frequencies per part of speech,
            or simply by word.
        '''
        section = self.default_section if section == 'default' else section

        # If there are no tokens, return an empty dataframe
        if self._token_count == 0:
            emptycols = ['page']
            if section in SECREF + ['all']:
                emptycols.append('section')
            emptycols.append('token' if case else 'lowercase')
            if pos:
                emptycols.append('pos')
            emptycols.append('count')
            return pd.DataFrame([], columns=emptycols)
        # If there's a volume-level representation, simply pull from that
        elif not self.volume._tokencounts.empty:
            try:
                df = self.volume._tokencounts.loc[([int(self.seq)]), ]
            except:
                logging.error("Error subsetting volume DF for seq: {}"
                              .format(self.seq))
                return
        # Create the internal representation if it does not already exist.
        # This will only need to be created once.
        elif self._tokencounts.empty:
            # Using the DF-building method from Volume
            self._tokencounts = self.volume._make_tokencount_df([self._json])
            df = self._tokencounts
        else:
            df = self._tokencounts

        return group_tokenlist(df, pages=True, section=section, case=case,
                               pos=pos)

    def end_line_chars(self, section='default'):
        return self.line_chars(section=section, place='end')

    def begin_line_chars(self, section='default'):
        return self.line_chars(section=section, place='begin')

    def line_chars(self, section='default', place='all'):
        ''' Get a dataframe of character counts at the start and end of lines
        '''
        section = self.default_section if section == 'default' else section

        # If there are no tokens, return an empty dataframe
        if self._token_count == 0:
            emptycols = ['page']
            if section in SECREF + ['all']:
                emptycols.append('section')
            if place in ['begin', 'end', 'all']:
                emptycols.append('place')
            emptycols.append('character')
            emptycols.append('count')
            return pd.DataFrame([], columns=emptycols)
        # If there's a volume-level representation, simply pull from that
        elif not self.volume._line_chars.empty:
            try:
                self._line_chars = self.volume._line_chars\
                                       .loc[([int(self.seq)]), ]
            except:
                logging.error("Error subsetting volume DF for seq: {}"
                              .format(self.seq))
                return
        # Create the internal representation if it does not already exist.
        # Since the code is the same, we'll use the definition from Volume.
        elif self._line_chars.empty:
            self._line_chars = self.volume._make_line_char_df([self._json])

        df = self._line_chars
        return group_linechars(df, section=section, place=place)

    def token_count(self, section='default'):
        ''' Count total tokens on the page '''
        return self.tokenlist(section=section)['count'].sum()

    def __str__(self):
        if self.volume:
            name = "<page %s of volume %s>" % (self.seq, self.volume.id)
        else:
            name = "<page %s with no volume parent>" % (self.seq)
        return name

def cap_alpha_seq(self, section='body'):
    ''' Return the longest length of consecutive capital letters starting a
    line on the page. Returns an integer. Only includes the body:
    header/footer information is not included. '''
    if section != 'body':
        logging.warning("cap_alpha_seq only includes counts for the body "
                        "section of pages.")
    return self._get_basic_stat('body', 'capAlphaSeq')

def line_chars(self, section='default', place='all'):
    ''' Get a dataframe of character counts at the start and end of lines '''
    section = self.default_section if section == 'default' else section

    # If there are no tokens, return an empty dataframe
    if self._token_count == 0:
        emptycols = ['page']
        if section in SECREF + ['all']:
            emptycols.append('section')
        if place in ['begin', 'end', 'all']:
            emptycols.append('place')
        emptycols.append('character')
        emptycols.append('count')
        return pd.DataFrame([], columns=emptycols)
    # If there's a volume-level representation, simply pull from that
    elif not self.volume._line_chars.empty:
        try:
            self._line_chars = self.volume._line_chars\
                                   .loc[([int(self.seq)]), ]
        except:
            logging.error("Error subsetting volume DF for seq: {}"
                          .format(self.seq))
            return
    # Create the internal representation if it does not already exist.
    # Since the code is the same, we'll use the definition from Volume.
    elif self._line_chars.empty:
        self._line_chars = self.volume._make_line_char_df([self._json])

    df = self._line_chars
    return group_linechars(df, section=section, place=place)
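A page-level sketch (volume loading as in the examples above); note that with schema 2.0 this data lives in the separate 'advanced' files:

page = next(vol.pages())
# Characters that end lines in the page's default ('body') section:
endings = page.end_line_chars()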

def token_count(self, section='default'):
    ''' Count total tokens on the page '''
    return self.tokenlist(section=section)['count'].sum()
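For instance, continuing with a page from the sketches above:

page.token_count(section='body')   # total body tokens on the page
page.token_count(section='group')  # header + body + footer combined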

def tokenlist(self, section='default', case=True, pos=True)

Get or set the token counts DataFrame

section[string]: Which part of the page to return. In addition to
'header', 'body', and 'footer', 'all' will return a DataFrame with
all the sections and 'group' will sum all sections.
section == 'default' falls back on what the page object has saved.

case[bool] : Preserve case, or fold. To save processing, it's likely
more efficient to calculate lowercase later in the process:
if you want information for all pages, first collect your
information case-sensitive, then fold at the end.

pos[bool] : Specify whether to return frequencies per part of speech,
or simply by word

def tokenlist(self, section='default', case=True, pos=True):
    ''' Get or set the token counts DataFrame

    section[string]: Which part of the page to return. In addition to
        'header', 'body', and 'footer', 'all' will return a DataFrame
        with all the sections and 'group' will sum all sections.
        section == 'default' falls back on what the page object has
        saved.

    case[bool]: Preserve case, or fold. To save processing, it's likely
        more efficient to calculate lowercase later in the process: if
        you want information for all pages, first collect your
        information case-sensitive, then fold at the end.

    pos[bool]: Specify whether to return frequencies per part of speech,
        or simply by word.
    '''
    section = self.default_section if section == 'default' else section

    # If there are no tokens, return an empty dataframe
    if self._token_count == 0:
        emptycols = ['page']
        if section in SECREF + ['all']:
            emptycols.append('section')
        emptycols.append('token' if case else 'lowercase')
        if pos:
            emptycols.append('pos')
        emptycols.append('count')
        return pd.DataFrame([], columns=emptycols)
    # If there's a volume-level representation, simply pull from that
    elif not self.volume._tokencounts.empty:
        try:
            df = self.volume._tokencounts.loc[([int(self.seq)]), ]
        except:
            logging.error("Error subsetting volume DF for seq: {}"
                          .format(self.seq))
            return
    # Create the internal representation if it does not already exist.
    # This will only need to be created once.
    elif self._tokencounts.empty:
        # Using the DF-building method from Volume
        self._tokencounts = self.volume._make_tokencount_df([self._json])
        df = self._tokencounts
    else:
        df = self._tokencounts

    return group_tokenlist(df, pages=True, section=section, case=case,
                           pos=pos)
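A short sketch (volume and page obtained as in the examples above):

# Case-folded counts by word only, for the page's body section:
counts = page.tokenlist(section='body', case=False, pos=False)
counts.head()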


class Volume(object):
    SUPPORTED_SCHEMA = ['2.0', '3.0']
    METADATA_FIELDS = [('schemaVersion', 'schema_version'),
                       ('dateCreated', 'date_created'), ('title', 'title'),
                       ('pubDate', 'pub_date'), ('language', 'language'),
                       ('htBibUrl', 'ht_bib_url'),
                       ('handleUrl', 'handle_url'), ('oclc', 'oclc'),
                       ('imprint', 'imprint'), ('names', 'names'),
                       ('classification', 'classification'),
                       ('typeOfResource', 'type_of_resource'),
                       ('issuance', 'issuance'), ('genre', 'genre'),
                       ('bibliographicFormat', 'bibliographic_format'),
                       ('pubPlace', 'pub_place'),
                       ('governmentDocument', 'government_document'),
                       ('sourceInstitution', 'source_institution'),
                       ('enumerationChronology', 'enumeration_chronology'),
                       ('hathitrustRecordNumber', 'hathitrust_record_number'),
                       ('rightsAttributes', 'rights_attributes'),
                       ('accessProfile', 'access_profile'),
                       ('volumeIdentifier', 'volume_identifier'),
                       ('sourceInstitutionRecordNumber',
                        'source_institution_record_number'),
                       ('isbn', 'isbn'), ('issn', 'issn'), ('lccn', 'lccn'),
                       ('lastUpdateDate', 'last_update_date')]
    ''' List of metadata fields, with their pythonic name mapping. '''

    BASIC_FIELDS = [('pageCount', 'page_count')]
    ''' List of fields which return primitive values in the schema, as tuples
    with (camelCase, lower_with_under) mapping. '''

    _metadata = None
    _tokencounts = pd.DataFrame()
    _line_chars = pd.DataFrame()

    def __init__(self, obj, advanced=False, default_page_section='body'):
        # Verify schema version
        self._schema = obj['features']['schemaVersion']
        if self._schema not in self.SUPPORTED_SCHEMA:
            logging.warning('Schema version of imported (%s) file does not '
                            'match the supported version (%s)' %
                            (obj['features']['schemaVersion'],
                             self.SUPPORTED_SCHEMA))
        self.id = obj['id']
        self._pages = obj['features']['pages']
        self.default_page_section = default_page_section

        # Expand basic values to properties
        for key, pythonkey in self.METADATA_FIELDS:
            if key in obj['metadata']:
                setattr(self, pythonkey, obj['metadata'][key])
        for key, pythonkey in self.BASIC_FIELDS:
            if key in obj['features']:
                setattr(self, pythonkey, obj['features'][key])

        if (hasattr(self, 'genre') and
                obj['metadata']['schemaVersion'] in ["1.0", "2.0"]):
            self.genre = self.genre.split(", ")

        self._has_advanced = False
        if advanced:
            if self._schema != '2.0':
                logging.warning("Only schema 2.0 supports advanced files. "
                                "Ignoring")
            else:
                self._has_advanced = True
                # Create an internal dataframe for lineChar counts
                self._line_chars = self._make_line_char_df(advanced['pages'])

        if (self._schema in ['2.0', '3.0']) and \
                (self.language in ['jpn', 'chi']):
            logging.warning("This version of the EF dataset has a "
                            "tokenization bug for Chinese and Japanese. See "
                            "https://wiki.htrc.illinois.edu/display/COM/Extracted+Features+Dataset#ExtractedFeaturesDataset-issues")

    def __iter__(self):
        return self.pages()

    def __str__(self):
        try:
            return "<HTRC Volume: %s - %s (%s)>" % (self.id, self.title,
                                                    self.year)
        except:
            return "<HTRC Volume: %s>" % self.id

    def _repr_html_(self):
        html_template = ("<strong><a href='%s'>%s</a></strong> by <em>%s</em>"
                         " (%s, %s pages) - <code>%s</code>")
        try:
            return html_template % (self.handle_url, self.title,
                                    ",".join(self.author), self.year,
                                    self.page_count, self.id)
        except:
            return "<strong><a href='%s'>%s</a></strong>" % (self.handle_url,
                                                             self.title)

    @property
    def year(self):
        ''' A friendlier name wrapping Volume.pubDate '''
        return self.pub_date

    @property
    def author(self):
        ''' A friendlier name wrapping Volume.names. Returns a list. '''
        return self.names

    @property
    def metadata(self):
        """
        Fetch additional information about a volume from the HathiTrust
        Bibliographic API.

        See: https://www.hathitrust.org/bib_api

        :return: A `pymarc` record. See pymarc's documentation for details
            on using it.
        """
        if not self._metadata:
            logging.debug("Looking up full metadata for {0}".format(self.id))
            data = requests.get(self.ht_bib_url).json()
            record_id = data['items'][0]['fromRecord']
            marc = data['records'][record_id]['marc-xml']
            # Pymarc only reads a file, so stream the text as if it were one
            xml_stream = StringIO(marc)
            xml_record = pymarc.parse_xml_to_array(xml_stream)[0]
            xml_stream.close()
            self._metadata = xml_record
        return self._metadata

    def tokens(self, section='default', case=True):
        ''' Get unique tokens '''
        tokens = self.tokenlist(section=section).index\
                     .get_level_values('token').to_series()
        if case:
            return tokens.unique().tolist()
        else:
            return tokens.str.lower().unique().tolist()

    def pages(self, **kwargs):
        for page in self._pages:
            yield Page(page, self, **kwargs)

    def tokens_per_page(self, **kwargs):
        ''' Return a one-dimensional pd.DataFrame of page lengths '''
        if 'section' not in kwargs or kwargs['section'] == 'default':
            section = self.default_page_section
        else:
            section = kwargs['section']

        d = [{'page': int(page['seq']), 'section': sec,
              'count': page[sec]['tokenCount']} for page in self._pages
             for sec in SECREF]
        df = pd.DataFrame(d).set_index(['page', 'section']).sort_index()

        if section in SECREF:
            return df.loc[(slice(None), section), ]\
                     .reset_index('section', drop=True)
        elif section == 'all':
            return df
        elif section == 'group':
            return df.groupby(level='page').sum()

    def line_counts(self, section='default'):
        ''' Return a list of line counts, per page '''
        return [page.line_count(section=section) for page in self.pages()]

    def empty_line_counts(self, section='default'):
        ''' Return a list of empty line counts, per page '''
        return [page.empty_line_count(section=section)
                for page in self.pages()]

    def cap_alpha_seq(self, section='body'):
        logging.warning("At the volume level, use Volume.cap_alpha_seqs()")
        return self.cap_alpha_seqs(section)

    def cap_alpha_seqs(self, section='body'):
        ''' Return the longest length of consecutive capital letters starting
        a line on the page. Returns a list for all pages. Only includes the
        body: header/footer information is not included. '''
        if section != 'body':
            logging.warning("cap_alpha_seq only includes counts for the body "
                            "section of pages.")
        return [page.cap_alpha_seq() for page in self.pages()]

    def sentence_counts(self, section='default'):
        ''' Return a list of sentence counts, per page '''
        return [page.sentence_count(section=section)
                for page in self.pages()]

    def tokenlist(self, pages=True, section='default', case=True, pos=True,
                  page_freq=False):
        ''' Get or set the token counts DataFrame

        pages[bool]: Keep page-level info if true, else fold.

        section[string]: Which part of the page to return. In addition to
            'header', 'body', and 'footer', 'all' will return a DataFrame
            with all the sections and 'group' will sum all sections.
            section == 'default' falls back on what the page object has
            saved.

        case[bool]: Preserve case, or fold.

        pos[bool]: Specify whether to return frequencies per part of
            speech, or simply by word.

        page_freq[bool]: Whether to count page frequency (1 if it occurs on
            the page, else 0) or a term frequency (counts for the term, per
            page).
        '''
        if section == 'default':
            section = self.default_page_section

        # Create the internal representation if it does not already exist.
        # This will only need to happen once.
        if self._tokencounts.empty:
            self._tokencounts = self._make_tokencount_df(self._pages)

        return group_tokenlist(self._tokencounts, pages=pages,
                               section=section, case=case, pos=pos,
                               page_freq=page_freq)

    def term_page_freqs(self, page_freq=True, case=True):
        ''' Return a term frequency x page matrix, or optionally a page
        frequency x page matrix '''
        all_page_dfs = self.tokenlist(page_freq=page_freq, case=case)
        return all_page_dfs.reset_index()\
                           .groupby(['token', 'page'], as_index=False).sum()\
                           .pivot(index='page', columns='token',
                                  values='count')\
                           .fillna(0)

    def term_volume_freqs(self, page_freq=True, pos=True, case=True):
        ''' Return a list of each term's frequency in the entire volume '''
        df = self.tokenlist(page_freq=page_freq, pos=pos, case=case)
        groups = ['token'] if not pos else ['token', 'pos']
        return df.reset_index().drop(['page'], axis=1)\
                 .groupby(groups, as_index=False).sum()\
                 .sort_values(by='count', ascending=False)

    def end_line_chars(self, **args):
        ''' Get counts of characters at the end of lines, i.e. the
        characters on the far right of the page. '''
        return self.line_chars(place='end', **args)

    def begin_line_chars(self, **args):
        ''' Get counts of characters at the beginning of lines, i.e. the
        characters on the far left of the page. '''
        return self.line_chars(place='begin', **args)

    def line_chars(self, section='default', place='all'):
        ''' Interface for all begin/end-of-line character information. '''
        if self._schema == '2.0' and not self._has_advanced:
            logging.error("For schema version 2.0, you need to load the "
                          "'advanced' file for begin/endLineChars")
            return

        if section == 'default':
            section = self.default_page_section

        if self._line_chars.empty and self._has_advanced:
            logging.error("Something went wrong. Expected advanced features "
                          "to already be processed")
            return
        elif self._line_chars.empty and not self._has_advanced:
            self._line_chars = self._make_line_char_df(self._pages)

        df = self._line_chars
        return group_linechars(df, section=section, place=place)

    def _make_tokencount_df(self, pages):
        ''' Returns a Pandas dataframe of:
            page / section / token / pos / count
        '''
        if self._schema == '1.0':
            tname = 'tokens'
        else:
            tname = 'tokenPosCount'

        # Make a structured numpy array.
        # Because it is typed, this approach is ~40x faster than earlier
        # methods.
        m = sum([page['tokenCount'] for page in pages])
        arr = np.zeros(m, dtype=[(str('page'), str('u8')),
                                 (str('section'), str('U6')),
                                 (str('token'), str('U64')),
                                 (str('pos'), str('U6')),
                                 (str('count'), str('u4'))])
        i = 0
        for page in pages:
            for sec in ['header', 'body', 'footer']:
                for token, posvalues in iteritems(page[sec][tname]):
                    for pos, value in iteritems(posvalues):
                        arr[i] = (page['seq'], sec, token, pos, value)
                        i += 1
                        if (i > m + 1):
                            logging.error("This volume has more token info "
                                          "than the internal representation "
                                          "allows. Email "
                                          "organisciak@gmail.com to let the "
                                          "library author know!")

        # Create a DataFrame
        df = pd.DataFrame(arr[:i]).set_index(['page', 'section',
                                              'token', 'pos'])
        df.sort_index(inplace=True, level=0, sort_remaining=True)
        return df

    def _make_line_char_df(self, pages):
        ''' Returns a Pandas dataframe of:
            page / section / place (i.e. begin/end) / char / count

        Provide an array of pages that hold beginLineChars and endLineChars.
        '''
        if self._schema == '3.0':
            logging.warning("Adapted to erroneous key names in schema 3.0.")
            place_key = [('begin', 'beginCharCounts'),
                         ('end', 'endCharCount')]
        else:
            place_key = [('begin', 'beginLineChars'),
                         ('end', 'endLineChars')]

        # Make a structured numpy array.
        # Because it is typed, this approach is ~40x faster than earlier
        # methods.
        m = len(pages) * 3 * 2  # Pages * section types * places
        arr = np.zeros(int(m * 100), dtype=[(str('page'), str('u8')),
                                            (str('section'), str('U6')),
                                            (str('place'), str('U5')),
                                            (str('char'), str('U1')),
                                            (str('count'), str('u8'))])
        i = 0
        for page in pages:
            for sec in ['header', 'body', 'footer']:
                for place, json_key in place_key:
                    for char, value in iteritems(page[sec][json_key]):
                        arr[i] = (page['seq'], sec, place, char, value)
                        i += 1

        # Create a DataFrame
        df = pd.DataFrame(arr[:i]).set_index(['page', 'section',
                                              'place', 'char'])
        df.sort_index(inplace=True)
        return df

    def __str__(self):
        def truncate(s, maxlen):
            if len(s) > maxlen:
                return s[:maxlen].strip() + "..."
            else:
                return s.strip()
        return "<Volume: %s (%s) by %s>" % (truncate(self.title, 30),
                                            self.year,
                                            truncate(self.author[0], 40))
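A sketch of the convenience accessors (volume loading as in the examples above). Note that `metadata` makes a live request to the HathiTrust Bibliographic API, and whether `title` is a method or a property depends on your pymarc version:

print(vol.year, vol.author)  # friendlier wrappers for pub_date / names
record = vol.metadata        # a pymarc record, fetched once then cached
print(record.title())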

def cap_alpha_seqs(self, section='body'):
    ''' Return the longest length of consecutive capital letters starting a
    line on the page. Returns a list for all pages. Only includes the body:
    header/footer information is not included. '''
    if section != 'body':
        logging.warning("cap_alpha_seq only includes counts for the body "
                        "section of pages.")
    return [page.cap_alpha_seq() for page in self.pages()]

def line_chars(self, section='default', place='all'):
    ''' Interface for all begin/end-of-line character information. '''
    if self._schema == '2.0' and not self._has_advanced:
        logging.error("For schema version 2.0, you need to load the "
                      "'advanced' file for begin/endLineChars")
        return

    if section == 'default':
        section = self.default_page_section

    if self._line_chars.empty and self._has_advanced:
        logging.error("Something went wrong. Expected advanced features "
                      "to already be processed")
        return
    elif self._line_chars.empty and not self._has_advanced:
        self._line_chars = self._make_line_char_df(self._pages)

    df = self._line_chars
    return group_linechars(df, section=section, place=place)
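A sketch (volume loading as above; for schema 2.0, pass the advanced file alongside the basic one when constructing the FeatureReader):

# Counts of characters at line ends in the body, per page:
endings = vol.end_line_chars(section='body')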

def term_volume_freqs(self, page_freq=True, pos=True, case=True):
    ''' Return a list of each term's frequency in the entire volume '''
    df = self.tokenlist(page_freq=page_freq, pos=pos, case=case)
    groups = ['token'] if not pos else ['token', 'pos']
    return df.reset_index().drop(['page'], axis=1)\
             .groupby(groups, as_index=False).sum()\
             .sort_values(by='count', ascending=False)
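For example (volume loading as above):

# Ten most frequent words in the volume, ignoring part of speech:
vol.term_volume_freqs(page_freq=False, pos=False).head(10)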

section[string]: Which part of the page to return. In addition to
'header', 'body', and 'footer', 'all' will return a DataFrame with
all the sections and 'group' will sum all sections.
section == 'default' falls back on what the page object has saved.

case[bool] : Preserve case, or fold.

pos[bool] : Specify whether to return frequencies per part of speech,
or simply by word

page_freq[bool] : Whether to count page frequency (1 if it occurs on
the page, else 0) or a term frequency (counts for the term, per page)

def tokenlist(self, pages=True, section='default', case=True, pos=True,
              page_freq=False):
    ''' Get or set the token counts DataFrame

    pages[bool]: Keep page-level info if true, else fold.

    section[string]: Which part of the page to return. In addition to
        'header', 'body', and 'footer', 'all' will return a DataFrame
        with all the sections and 'group' will sum all sections.
        section == 'default' falls back on what the page object has
        saved.

    case[bool]: Preserve case, or fold.

    pos[bool]: Specify whether to return frequencies per part of speech,
        or simply by word.

    page_freq[bool]: Whether to count page frequency (1 if it occurs on
        the page, else 0) or a term frequency (counts for the term, per
        page).
    '''
    if section == 'default':
        section = self.default_page_section

    # Create the internal representation if it does not already exist.
    # This will only need to happen once.
    if self._tokencounts.empty:
        self._tokencounts = self._make_tokencount_df(self._pages)

    return group_tokenlist(self._tokencounts, pages=pages, section=section,
                           case=case, pos=pos, page_freq=page_freq)
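A closing sketch of the main access patterns (volume loading as in the examples above):

tl = vol.tokenlist(case=False, pos=False)  # counts per page, lowercase token
pf = vol.tokenlist(page_freq=True, section='group')  # 1/0 per token and page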