mirror of
https://github.com/kennethreitz-archive/usajobs-scraper.git
synced 2026-06-05 23:40:16 +00:00
186 lines
5.1 KiB
Python
186 lines
5.1 KiB
Python
from BeautifulSoup import BeautifulSoup
|
|
from kenlib import *
|
|
import re
|
|
import urllib
|
|
import MySQLdb
|
|
|
|
# Names of the values scraped for each job, in the same order as the
# positional parameters of Job.__init__.
fields = [
    'id', 'title', 'closingDate', 'agency',
    'locationCity', 'locationState', 'locationCountry',
    'salary', 'details', 'vacancy', 'apply', 'plan',
    'appointmentTerm', 'jobStatus', 'openingDate',
]

# The same names passed through snake_case (imported from kenlib); Job.store
# uses them as the column list of its INSERT statement.
sqlfields = map(snake_case, fields)
|
|
|
|
# Read the locally saved results page and parse it once, at import time.
# The `with` block guarantees the file handle is closed after reading.
with open('./output') as output_file:
    html = output_file.read()  # LOCAL

# `html` is already a single string, so it is handed to the parser as-is.
soup = BeautifulSoup(html)
|
|
|
|
class Job():
    """One scraped job posting that can write itself to the ``jobs`` table.

    Every constructor argument is stored unchanged on the instance under the
    attribute of the same name.
    """

    def __init__(self, id, title, closingDate, agency, locationCity,
                 locationState, locationCountry, salary, details, vacancy,
                 apply, plan, appointmentTerm, jobStatus, openingDate):
        # Plain attribute assignment needs no exception handling. The former
        # blanket try/except only ran `del self` in its handler, which just
        # unbinds the local name and never affected the new instance.
        self.id = id
        self.title = title
        self.closingDate = closingDate
        self.agency = agency
        self.locationCity = locationCity
        self.locationState = locationState
        self.locationCountry = locationCountry
        self.salary = salary
        self.details = details
        self.vacancy = vacancy
        self.apply = apply
        self.plan = plan
        self.appointmentTerm = appointmentTerm
        self.jobStatus = jobStatus
        self.openingDate = openingDate

    def __str__(self):
        """Return ``"<title> -> <vacancy>"``."""
        return "%s -> %s" % (self.title, self.vacancy)

    def store(self):
        """Insert this job as one row of the ``jobs`` table.

        Column names are taken from the module-level ``sqlfields``. The values
        are passed to the driver as query parameters, so quoting and escaping
        are done by MySQLdb rather than by string interpolation here.

        Any exception raised by MySQLdb propagates to the caller; the cursor
        and the connection are closed in either case.
        """
        # Order must match the column order in `sqlfields`.
        values = [
            self.id, self.title, self.closingDate,
            self.agency, self.locationCity, self.locationState,
            self.locationCountry,
            self.salary, self.details,
            self.vacancy, self.apply,
            self.plan, self.appointmentTerm,
            self.jobStatus, self.openingDate,
        ]
        placeholders = ', '.join(['%s'] * len(values))
        query = "INSERT INTO jobs (%s) VALUES (%s)" % (
            ', '.join(sqlfields), placeholders)

        # NOTE(review): connection settings (including the password) are
        # hard-coded here and in clearDB; consider moving them to config.
        conn = MySQLdb.connect(host="localhost", user="admin",
                               passwd="drummer42", db="scraper")
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(query, values)
            finally:
                cursor.close()
            # MySQLdb does not autocommit by default; without this the row is
            # discarded on transactional storage engines when the connection
            # closes.
            conn.commit()
        finally:
            conn.close()

    def inspect(self):
        """Print the job's title and id, separated by a space."""
        print("%s %s" % (self.title, self.id))
|
|
|
|
def scrapePage():
    """Build one Job for every posting found on the parsed page.

    getResults is called once per name in ``fields``, producing one column of
    values per field. The columns are then regrouped row by row, and each row
    is passed positionally to Job (``fields`` is in Job's parameter order).

    Two defects of the previous version are fixed here: it deleted the column
    data inside the loop, so the second iteration failed on an unbound name,
    and it looped ``len(fields)`` times instead of once per posting.

    Returns:
        list of Job, one per posting. If the columns have different lengths,
        zip stops at the shortest one.
    """
    columns = [getResults(name) for name in fields]
    # zip(*columns) transposes "one list per field" into "one tuple per job".
    return [Job(*row) for row in zip(*columns)]
|
|
|
|
def getResults(rtype):
    """Return the values of one field for every posting in ``soup``.

    Args:
        rtype: field name, e.g. 'id', 'title', 'salary', 'locationCity'.

    Returns:
        A list with one entry per matching element. For 'location' each
        entry is the list of '-'-separated parts of the area text; for all
        other types it is a single value.

    Raises:
        NameError: if ``rtype`` is not a known field name.
        IndexError: for the location* types, if an area text has too few
            '-'-separated parts.
    """

    def getMeta(type, title):
        # Find every `type` tag whose id ends in "_ctl<1-2 digits>_<title>".
        print("fetching " + title + "s")
        re0 = r'MasterPage1_middleContent__ctlResultsFlat_rptResults__ctl(\d)(\d)?_%s$' % (title)
        rg = re.compile(re0, re.IGNORECASE | re.DOTALL)
        return soup.findAll(type, {'id': rg})

    def scrub(dirty, meta='contents'):
        # Replace each element, in place, by its first child with newlines
        # removed ('contents') or by its href attribute ('href'), then pass
        # the list through utf8ify (from kenlib).
        if meta == 'contents':
            for i, node in enumerate(dirty):
                dirty[i] = node.contents[0].replace('\n', '')
        elif meta == 'href':
            for i, node in enumerate(dirty):
                dirty[i] = node['href']
        else:
            raise ValueError("unknown scrub mode: %r" % (meta,))
        return utf8ify(dirty)

    def areaParts():
        # Area text split on '-'. The branches below use part 0 as country,
        # part 1 as state and part 2 (when present) as city.
        return [item.split('-') for item in scrub(getMeta('span', 'lblArea'))]

    # Fields that are just the scrubbed text of one labelled element.
    # NOTE(review): the previous if/elif chain had two 'salaryRange' branches;
    # the first read 'lblArea' (the location label) and made the second,
    # 'lblSalaryRange', unreachable. The 'lblArea' one looks like a copy-paste
    # slip, so 'lblSalaryRange' is used here -- confirm against the page.
    plain = {
        'title': ('a', 'lnkTitle'),
        'closingDate': ('span', 'lblDateMiles'),
        'agency': ('span', 'lblCompany'),
        'details': ('div', 'jobDetailBodyDiv'),
        'vacancy': ('span', 'lblVacancyAnnNumber'),
        'apply': ('span', 'lblWhoMayApply'),
        'plan': ('span', 'lblPayPlan'),
        'appointmentTerm': ('span', 'lblAppointmentTerm'),
        'jobStatus': ('span', 'lblJobStatus'),
        'openingDate': ('span', 'lblOpeningDate'),
        'salaryRange': ('span', 'lblSalaryRange'),
    }

    # begin processing

    if rtype in plain:
        tag, label = plain[rtype]
        return scrub(getMeta(tag, label))

    if rtype == 'id':
        hrefs = scrub(getMeta('a', 'lnkTitle'), 'href')
        # Characters 45..51 of the title link's href are used as the id.
        # NOTE(review): presumably a fixed-width job id at a fixed offset in
        # the URL -- confirm.
        return [href[45:52] for href in hrefs]

    if rtype == 'location':
        return areaParts()

    if rtype == 'locationCity':
        # Use the third part when there is one, otherwise the second.
        return [parts[2] if len(parts) > 2 else parts[1]
                for parts in areaParts()]

    if rtype == 'locationState':
        return [parts[1] for parts in areaParts()]

    if rtype == 'locationCountry':
        return [parts[0] for parts in areaParts()]

    if rtype == 'salary':
        # Strip thousands separators, '$', '+' and '.00'.
        return [
            item.replace(',', '').replace('$', '').replace('+', '').replace('.00', '')
            for item in scrub(getMeta('span', 'lblSalary'))
        ]

    raise NameError("unknown result type: %r" % (rtype,))
|
|
|
|
def clearDB():
    """Remove every row from the ``jobs`` table with TRUNCATE TABLE.

    Any exception raised by MySQLdb propagates to the caller; the cursor and
    the connection are closed in either case.
    """
    print("clearing DB")
    # NOTE(review): connection settings (including the password) are
    # hard-coded here and in Job.store; consider moving them to config.
    conn = MySQLdb.connect(host="localhost", user="admin",
                           passwd="drummer42", db="scraper")
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("TRUNCATE TABLE jobs")
        finally:
            cursor.close()
    finally:
        conn.close()
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Empty the jobs table first, then store each job scraped from the page.
    clearDB()
    scraped = scrapePage()
    for posting in scraped:
        print("Storing: %s" % posting.title)
        posting.store()
|
|
|
|
# print getResults('salary') |