from BeautifulSoup import BeautifulSoup from kenlib import * import re import urllib import MySQLdb fields = ['id', 'title', 'closingDate', 'agency', 'locationCity', 'locationState', 'locationCountry', 'salary', 'details', 'vacancy', 'apply', 'plan', 'appointmentTerm', 'jobStatus', 'openingDate'] sqlfields = map(snake_case, fields[:]) # Open up output file html = open('./output').read() # LOCAL soup = BeautifulSoup(''.join(html)) class Job(): def __init__(self, id, title, closingDate, agency, locationCity, locationState, locationCountry, salary, details, vacancy, apply, plan, appointmentTerm, jobStatus, openingDate): try: self.id = id self.title = title self.closingDate = closingDate self.agency = agency # self.location = ', '.join(location) self.locationCity = locationCity self.locationState = locationState self.locationCountry = locationCountry self.salary = salary self.details = details self.vacancy = vacancy self.apply = apply self.plan = plan self.appointmentTerm = appointmentTerm self.jobStatus = jobStatus self.openingDate = openingDate except Exception, e: del self def __str__(self): return "%s -> %s" % (self.title, self.vacancy) def store(self): conn = MySQLdb.connect(host="localhost", user="admin", passwd="drummer42", db="scraper") cursor = conn.cursor () query = "INSERT INTO jobs (%s) " % (', '.join(sqlfields))+ \ "VALUES (%s )" % \ ', '.join(conn.literal(f) for f in [self.id, self.title, self.closingDate, self.agency, self.locationCity, self.locationState, self.locationCountry, self.salary, self.details, self.vacancy, self.apply, self.plan, self.appointmentTerm, self.jobStatus, self.openingDate ]) cursor.execute (query) cursor.close () conn.close () del self def inspect(self): print self.title, self.id def scrapePage(): b = map(getResults, fields) resultSet = [] for i in qrange(len(fields)): newJob = Job(*[b[j][i] for j in qrange(len(fields))]) del b resultSet.append(newJob) del newJob return resultSet def getResults(rtype): def getMeta(type, title): print "fetching " + title + "s" re0='MasterPage1_middleContent__ctlResultsFlat_rptResults__ctl(\d)(\d)?_%s$' % (title) rg = re.compile(re0,re.IGNORECASE|re.DOTALL) return soup.findAll(type, { 'id' : rg}) def scrub(dirty, meta = 'contents'): if meta == 'contents': for i in qrange(len(dirty)): dirty[i] = dirty[i].contents[0].replace('\n','') clean = utf8ify(dirty) elif meta == 'href': for i in qrange(len(dirty)): dirty[i] = dirty[i]['href'] clean = utf8ify(dirty) return clean # begin proccessing if rtype == 'id': results = scrub(getMeta('a', 'lnkTitle'), 'href') for i in qrange(len(results)): results[i] = results[i][45:52] elif rtype == 'title': results = scrub(getMeta('a', 'lnkTitle')) elif rtype == 'closingDate': results = scrub(getMeta('span', 'lblDateMiles')) elif rtype == 'agency': results = scrub(getMeta('span', 'lblCompany')) elif rtype == 'location': results = scrub(getMeta('span', 'lblArea')) results = [item.split('-') for item in results] elif rtype == 'locationCity': results = scrub(getMeta('span', 'lblArea')) results = [item.split('-') for item in results] for i in range(len(results)): try: results[i] = results[i][2] except Exception, e: results[i] = results[i][1] elif rtype == 'locationState': results = scrub(getMeta('span', 'lblArea')) results = [item.split('-') for item in results] for i in range(len(results)): results[i] = results[i][1] elif rtype == 'locationCountry': results = scrub(getMeta('span', 'lblArea')) results = [item.split('-') for item in results] for i in range(len(results)): results[i] = results[i][0] elif rtype == 'salaryRange': results = scrub(getMeta('span', 'lblArea')) elif rtype == 'salary': results = scrub(getMeta('span', 'lblSalary')) results = [item.replace(',', '').replace('$', '').replace('+', '').replace('.00', '') for item in results] elif rtype == 'details': results = scrub(getMeta('div', 'jobDetailBodyDiv')) elif rtype == 'vacancy': results = scrub(getMeta('span', 'lblVacancyAnnNumber')) elif rtype == 'apply': results = scrub(getMeta('span', 'lblWhoMayApply')) elif rtype == 'plan': results = scrub(getMeta('span', 'lblPayPlan')) elif rtype == 'appointmentTerm': results = scrub(getMeta('span', 'lblAppointmentTerm')) elif rtype == 'jobStatus': results = scrub(getMeta('span', 'lblJobStatus')) elif rtype == 'openingDate': results = scrub(getMeta('span', 'lblOpeningDate')) elif rtype == 'salaryRange': results = scrub(getMeta('span', 'lblSalaryRange')) else: raise NameError return results def clearDB(): print "clearing DB" conn = MySQLdb.connect(host="localhost", user="admin", passwd="drummer42", db="scraper") cursor = conn.cursor () cursor.execute ("TRUNCATE TABLE jobs") cursor.close () conn.close () if __name__ == '__main__': clearDB() for job in scrapePage(): print "Storing: %s" % job.title job.store() # print getResults('salary')