Files
usajobs-scraper/scraper.py
T
Kenneth Reitz ba4276e7cb Update
2009-11-03 17:48:26 -05:00

105 lines
4.7 KiB
Python

from BeautifulSoup import BeautifulSoup
from kenlib import *
import re
import urllib
import MySQLdb
class Job():
    """One scraped job listing, normalised for storage in the ``jobs`` table.

    Every constructor argument is a sequence whose first item is a text
    value (``scrapePage`` passes an element's ``.contents``).  That first
    item is UTF-8 encoded and stripped of newline characters before being
    stored on the attribute of the same name.  An empty sequence yields an
    empty string instead of raising ``IndexError``.
    """

    # Attribute names, in the same order as the columns listed in the
    # INSERT statement built by ``store``.
    _FIELDS = (
        'title', 'closingDate', 'agency', 'location', 'salary', 'details',
        'vacancy', 'apply', 'plan', 'appointmentTerm', 'jobStatus',
        'openingDate', 'salaryRange',
    )

    def __init__(self, title, closingDate, agency, location, salary, details,
                 vacancy, apply, plan, appointmentTerm, jobStatus, openingDate,
                 salaryRange):
        self.title = self._clean(title)
        self.closingDate = self._clean(closingDate)
        self.agency = self._clean(agency)
        self.location = self._clean(location)
        self.salary = self._clean(salary)
        self.details = self._clean(details)
        self.vacancy = self._clean(vacancy)
        self.apply = self._clean(apply)
        self.plan = self._clean(plan)
        self.appointmentTerm = self._clean(appointmentTerm)
        self.jobStatus = self._clean(jobStatus)
        self.openingDate = self._clean(openingDate)
        self.salaryRange = self._clean(salaryRange)

    def _clean(self, contents):
        """Return the first item of *contents*, UTF-8 encoded, without newlines.

        An empty *contents* (an element with no children) gives an empty
        byte string rather than an ``IndexError``.
        """
        if not contents:
            return b''
        return contents[0].encode('utf-8').replace(b'\n', b'')

    def __str__(self):
        return "%s -> %s" % (self.title, self.vacancy)

    def store(self):
        """Insert this job as a single row of the ``jobs`` table and commit it.

        The connection and cursor are always closed, even when the insert
        fails; in that case the exception propagates and nothing is
        committed.
        """
        query = (
            "INSERT INTO jobs (title, closing_date, agency, location, salary, "
            "details, vacancy, apply, plan, appointment_term, job_status, "
            "opening_date, salary_range) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        )
        params = tuple(getattr(self, name) for name in self._FIELDS)

        # NOTE(review): these credentials are hard-coded in source control;
        # they should be rotated and loaded from configuration instead.
        conn = MySQLdb.connect(host="kennethreitz.com", user="admin", passwd="drummer42", db="scraper")
        try:
            cursor = conn.cursor()
            try:
                # Let the driver quote the values rather than formatting
                # them into the statement by hand.
                cursor.execute(query, params)
            finally:
                cursor.close()
            # MySQLdb connections start with autocommit disabled, so the
            # row is only persisted on transactional tables after commit().
            conn.commit()
        finally:
            conn.close()

    def inspect(self):
        """Print every field on its own line, followed by blank lines."""
        for name in self._FIELDS:
            print(getattr(self, name))
        print('\n' * 4)
# Parse the locally saved results page once, at import time.  getResults()
# searches the module-level ``soup`` built here.
with open('./output') as _output_file:  # LOCAL
    html = _output_file.read()
soup = BeautifulSoup(html)
def scrapePage():
    """Build one Job per result found on the parsed page.

    The matches for each of the thirteen fields are fetched with
    getResults(); the i-th match of every field is combined into the i-th
    Job.  The number of jobs is the number of 'title' matches, so a field
    with fewer matches than that raises IndexError.

    Returns a list of Job instances, in page order.
    """
    fieldNames = (
        'title', 'closingDate', 'agency', 'location', 'salary', 'details',
        'vacancy', 'apply', 'plan', 'appointmentTerm', 'jobStatus',
        'openingDate', 'salaryRange',
    )
    columns = [getResults(fieldName) for fieldName in fieldNames]

    jobs = []
    for row in range(len(columns[0])):
        # Job takes its arguments in the same order as fieldNames.
        jobs.append(Job(*[column[row].contents for column in columns]))
    return jobs
# Maps each result-field name to the (tag name, element-id suffix) used to
# locate that field's elements in the parsed page.
_RESULT_SELECTORS = {
    'title': ('a', 'lnkTitle'),
    'closingDate': ('span', 'lblDateMiles'),
    'agency': ('span', 'lblCompany'),
    'location': ('span', 'lblArea'),
    'salary': ('span', 'lblSalary'),
    'details': ('div', 'jobDetailBodyDiv'),
    'vacancy': ('span', 'lblVacancyAnnNumber'),
    'apply': ('span', 'lblWhoMayApply'),
    'plan': ('span', 'lblPayPlan'),
    'appointmentTerm': ('span', 'lblAppointmentTerm'),
    'jobStatus': ('span', 'lblJobStatus'),
    'openingDate': ('span', 'lblOpeningDate'),
    'salaryRange': ('span', 'lblSalaryRange'),
}


def getResults(rtype):
    """Return every element of the parsed page that holds field *rtype*.

    *rtype* is one of the keys of ``_RESULT_SELECTORS``.  Elements are
    matched by tag name and by an ``id`` that ends in
    ``..._rptResults__ctl<one or two digits>_<suffix>``, compared
    case-insensitively.

    Returns the result of ``soup.findAll`` for a known field, or ``None``
    when *rtype* is not a known field name.
    """
    selector = _RESULT_SELECTORS.get(rtype)
    if selector is None:
        return None

    tagName, idSuffix = selector
    # Raw string so that ``\d`` reaches the regex engine unchanged instead
    # of being treated as a (deprecated) string escape.
    idPattern = re.compile(
        r'MasterPage1_middleContent__ctlResultsFlat_rptResults__ctl(\d)(\d)?_%s$' % idSuffix,
        re.IGNORECASE | re.DOTALL,
    )
    return soup.findAll(tagName, {'id': idPattern})
if __name__ == '__main__':
    # Scrape the parsed page and persist every job that was found.
    for job in scrapePage():
        # The original used ``$`` here, which is a syntax error; string
        # formatting needs ``%``.
        print("Storing %s..." % (job.title,))
        job.store()
    # Disabled debugging aid, kept from the original:
    # print len(job)
    # for t in ['title', 'closingDate', 'agency', 'location', 'salary', 'details', 'vacancy', 'apply', 'plan', 'appointmentTerm', 'jobStatus', 'openingDate', 'salaryRange']:
    #     print getResults(t)[0].contents[0].encode('utf-8').replace('\n','')
    #     print '>'