Files
Kenneth Reitz 8e533d17d0 Update
2009-11-04 15:58:48 -05:00

186 lines
5.1 KiB
Python

from BeautifulSoup import BeautifulSoup
from kenlib import *
import re
import urllib
import MySQLdb
# Attribute names of a scraped job. The order matters: it is the positional
# argument order of Job.__init__ and the column order used by Job.store.
fields = [
    'id', 'title', 'closingDate', 'agency',
    'locationCity', 'locationState', 'locationCountry',
    'salary', 'details', 'vacancy', 'apply', 'plan',
    'appointmentTerm', 'jobStatus', 'openingDate',
]

# Database column names, one per entry of ``fields``. ``snake_case`` comes from
# the ``kenlib`` star import (presumably camelCase -> snake_case; confirm there).
# Built as a real list so it can be joined any number of times.
sqlfields = [snake_case(name) for name in fields]

# Open up output file
# The page is read from a saved copy on disk and parsed once at import time.
page = open('./output')  # LOCAL
try:
    html = page.read()
finally:
    # Close the handle explicitly instead of relying on garbage collection.
    page.close()
soup = BeautifulSoup(html)
class Job(object):
    """One scraped job posting, with a helper to persist it to MySQL.

    Every constructor argument is stored unchanged on the instance under the
    same name. The argument order matches the module-level ``fields`` list.
    """

    def __init__(self, id, title, closingDate, agency,
                 locationCity, locationState, locationCountry,
                 salary, details, vacancy, apply, plan,
                 appointmentTerm, jobStatus, openingDate):
        # Plain attribute assignment cannot fail, so the former try/except
        # (whose handler only unbound the local name ``self``) was removed.
        self.id = id
        self.title = title
        self.closingDate = closingDate
        self.agency = agency
        self.locationCity = locationCity
        self.locationState = locationState
        self.locationCountry = locationCountry
        self.salary = salary
        self.details = details
        self.vacancy = vacancy
        self.apply = apply
        self.plan = plan
        self.appointmentTerm = appointmentTerm
        self.jobStatus = jobStatus
        self.openingDate = openingDate

    def __str__(self):
        """Return ``"<title> -> <vacancy>"``."""
        return "%s -> %s" % (self.title, self.vacancy)

    def store(self):
        """Insert this job as one row of the ``jobs`` table.

        Column names are taken from the module-level ``sqlfields`` list; the
        values below are listed in the same order as ``fields``. Any database
        error raised by MySQLdb propagates to the caller after the cursor and
        connection have been closed.
        """
        values = (
            self.id, self.title, self.closingDate,
            self.agency, self.locationCity, self.locationState, self.locationCountry,
            self.salary, self.details,
            self.vacancy, self.apply,
            self.plan, self.appointmentTerm,
            self.jobStatus, self.openingDate,
        )
        # One %s placeholder per value: the driver escapes and quotes the
        # values itself instead of us splicing literals into the SQL text.
        placeholders = ', '.join(['%s'] * len(values))
        query = "INSERT INTO jobs (%s) VALUES (%s)" % (', '.join(sqlfields), placeholders)

        # NOTE(review): credentials are hard-coded here (and in clearDB);
        # consider moving them to configuration.
        conn = MySQLdb.connect(host="localhost", user="admin", passwd="drummer42", db="scraper")
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(query, values)
            finally:
                cursor.close()
            # Commit explicitly: without it the insert is discarded on close
            # for transactional tables; it is a harmless no-op otherwise.
            conn.commit()
        finally:
            conn.close()

    def inspect(self):
        """Print the job's title and id separated by a space."""
        print("%s %s" % (self.title, self.id))
def scrapePage():
    """Build one ``Job`` per result row found in the parsed page.

    ``getResults`` is called once for every name in ``fields``, giving one
    list ("column") of values per field. The columns are then transposed so
    that the i-th value of every column becomes the constructor arguments of
    the i-th ``Job``.

    Returns:
        A list of ``Job`` instances in page order.
    """
    columns = [getResults(name) for name in fields]
    # The previous loop ran ``len(fields)`` times, i.e. it was bounded by the
    # number of columns rather than the number of rows, and deleted the column
    # data while it was still needed. zip(*columns) yields exactly one tuple
    # per row; if columns differ in length it stops at the shortest one.
    return [Job(*row) for row in zip(*columns)]
def getResults(rtype):
    """Extract one column of values from the parsed results page (``soup``).

    Args:
        rtype: Name of the value to extract. Supported names are ``'id'``,
            ``'title'``, ``'closingDate'``, ``'agency'``, ``'location'``,
            ``'locationCity'``, ``'locationState'``, ``'locationCountry'``,
            ``'salary'``, ``'salaryRange'``, ``'details'``, ``'vacancy'``,
            ``'apply'``, ``'plan'``, ``'appointmentTerm'``, ``'jobStatus'``
            and ``'openingDate'``.

    Returns:
        A list with one entry per matching element in the page. Entries are
        strings, except for ``'location'`` where each entry is the list of
        ``'-'``-separated parts of the area text.

    Raises:
        NameError: if ``rtype`` is not one of the supported names.
        IndexError: if an area text has too few ``'-'``-separated parts for
            the requested location component.
    """
    def getMeta(tag, title):
        # Find every ``tag`` element whose id ends in "_ctl<1-2 digits>_<title>".
        print("fetching " + title + "s")
        pattern = re.compile(
            r'MasterPage1_middleContent__ctlResultsFlat_rptResults__ctl(\d)(\d)?_%s$' % (title),
            re.IGNORECASE | re.DOTALL)
        return soup.findAll(tag, {'id': pattern})

    def scrub(nodes, meta='contents'):
        # Reduce each element to a string: either its first child with
        # newlines removed ('contents') or its href attribute ('href').
        # ``utf8ify`` comes from kenlib (presumably encodes to UTF-8; confirm).
        if meta == 'contents':
            raw = [node.contents[0].replace('\n', '') for node in nodes]
        elif meta == 'href':
            raw = [node['href'] for node in nodes]
        else:
            raise ValueError("unknown meta: %r" % (meta,))
        return utf8ify(raw)

    def location_parts():
        # The area text is split on '-'; the branches below take part 0 as
        # country, part 1 as state and part 2 as city.
        return [area.split('-') for area in scrub(getMeta('span', 'lblArea'))]

    # Fields that need no post-processing: rtype -> (tag name, control name).
    # 'salaryRange' previously had two branches; the reachable one queried
    # 'lblArea' (a copy of the location lookup) and the 'lblSalaryRange' one
    # was dead code. Only the latter is kept.
    plain_fields = {
        'title': ('a', 'lnkTitle'),
        'closingDate': ('span', 'lblDateMiles'),
        'agency': ('span', 'lblCompany'),
        'details': ('div', 'jobDetailBodyDiv'),
        'vacancy': ('span', 'lblVacancyAnnNumber'),
        'apply': ('span', 'lblWhoMayApply'),
        'plan': ('span', 'lblPayPlan'),
        'appointmentTerm': ('span', 'lblAppointmentTerm'),
        'jobStatus': ('span', 'lblJobStatus'),
        'openingDate': ('span', 'lblOpeningDate'),
        'salaryRange': ('span', 'lblSalaryRange'),
    }

    # begin processing
    if rtype in plain_fields:
        tag, control = plain_fields[rtype]
        return scrub(getMeta(tag, control))

    if rtype == 'id':
        hrefs = scrub(getMeta('a', 'lnkTitle'), 'href')
        # NOTE(review): characters 45..51 of the link are taken as the id;
        # this depends on the exact URL layout of the scraped site.
        return [href[45:52] for href in hrefs]

    if rtype == 'location':
        return location_parts()

    if rtype == 'locationCity':
        # Use the third part when present, otherwise fall back to the second.
        return [parts[2] if len(parts) > 2 else parts[1]
                for parts in location_parts()]

    if rtype == 'locationState':
        return [parts[1] for parts in location_parts()]

    if rtype == 'locationCountry':
        return [parts[0] for parts in location_parts()]

    if rtype == 'salary':
        amounts = scrub(getMeta('span', 'lblSalary'))
        # Strip thousands separators, currency sign, '+' and a '.00' suffix.
        return [amount.replace(',', '').replace('$', '').replace('+', '').replace('.00', '')
                for amount in amounts]

    raise NameError("unknown result type: %r" % (rtype,))
def clearDB():
    """Remove every row from the ``jobs`` table.

    Prints a progress message, connects to the local ``scraper`` database and
    issues ``TRUNCATE TABLE jobs``. Any MySQLdb error propagates to the caller
    after the cursor and connection have been closed.
    """
    print("clearing DB")
    # NOTE(review): credentials are hard-coded here (and in Job.store);
    # consider moving them to configuration.
    conn = MySQLdb.connect(host="localhost", user="admin", passwd="drummer42", db="scraper")
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("TRUNCATE TABLE jobs")
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the statement fails.
        conn.close()
if __name__ == '__main__':
    # Script entry point: empty the jobs table first, then scrape the parsed
    # page and persist every job, announcing each one by title.
    clearDB()
    jobs = scrapePage()
    for job in jobs:
        print("Storing: %s" % job.title)
        job.store()
    # Debugging aid: print(getResults('salary'))