Main scraping function. Moves through each page of the list of NYC tutors. Statement to check for the end of the list is not currently working. Need to ctrl-c out of the program to stop scraping.

"""

whileTrue:

print'Scraping page '+str(self.page)

# Format of url is http://www.wyzant.com/New_York_City_tutors.aspx?sl=80075877&sort=27&pagesize=5&pagenum=1 where the 1 at the end is the page number. Increase page number by 1 each iteration of while loop in order to crawl through pages.

self.current_url =self.region_url +str(self.page)

ufile = urllib2.urlopen(self.current_url)

if ufile.geturl()[-1:] ==self.base_url:

break# Check for end of list

people = BeautifulSoup(ufile).findAll('div', {'class':'tutorFR hide medium-show'}) # picks out each person on the page

self.scrape_people(people)

self.page +=1# Next page

def scrape_people(self, people):
    """
    Scrape profile info from all people on a given page of WyzAnt.

    For each entry in ``people`` the tutor's profile page is fetched, every
    feature-extraction method of the class is run on the parsed page, and
    the results are appended to ``self.outfile`` as one JSON object per
    line. ``self.current_url`` is set to the profile being scraped and
    ``self.idx`` is incremented once per tutor written.

    Arguments:
    people -- iterable of parsed tutor elements; the ``href`` of the first
              ``<a>`` found in each one is appended to ``self.base_url``
              to form the profile URL.

    Raises whatever ``urllib2.urlopen`` raises if the single retry of a
    failed request fails as well.
    """
    for person in people:
        self.current_url = self.base_url + person.find('a')['href']

        # Retry once after a short pause if the first request fails.
        # Catch Exception instead of using a bare except so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop the scraper rather
        # than being swallowed and turned into a retry.
        # Should do a better version of try/excepting of connection issues.
        try:
            person_ufile = urllib2.urlopen(self.current_url)
        except Exception:
            time.sleep(1)
            person_ufile = urllib2.urlopen(self.current_url)

        # Release the connection as soon as the page has been parsed, even
        # if parsing fails.
        try:
            soup = BeautifulSoup(person_ufile)
        finally:
            person_ufile.close()

        # Get various features by running all class methods.
        name = self.get_name(soup)
        hourly_rate = self.get_hourly_rate(soup)
        raw_subjects, qual_subjects, linked_subjects = self.get_subjects(soup)
        education = self.get_education(soup)
        badge = self.get_tutor_badge(soup)
        profile_picture = self.has_profile_picture(soup)
        rating, number_of_ratings = self.get_rating(soup)
        zip_radius, zip_code = self.get_zip_code(soup)
        student_reviews = self.get_student_reviews(soup)
        background_check = self.get_background_check(soup)
        response_time = self.get_response_time(soup)
        bio = self.get_bio(soup)

        # Progress output; the parenthesised form prints the same text
        # under Python 2 and also parses under Python 3.
        print(name)

        # Write features to dictionary.
        person_row = {
            'name': name,
            'hourly_rate': hourly_rate,
            'raw_subjects': raw_subjects,
            'qual_subjects': qual_subjects,
            'linked_subjects': linked_subjects,
            'education': education,
            'badge': badge,
            'profile_picture': profile_picture,
            'rating': rating,
            'number_of_ratings': number_of_ratings,
            'zip_radius': zip_radius,
            'zip_code': zip_code,
            'student_reviews': student_reviews,
            'background_check': background_check,
            'response_time': response_time,
            'bio': bio,
            'url': self.current_url,
        }

        # Write dictionary to JSON output file, one object per line.
        json.dump(person_row, self.outfile)
        self.outfile.write('\n')

        self.idx += 1

defget_name(self, soup):

"""

Get tutors name. Just returns first name and last initial. ex: Ethan R.