mirror of
https://github.com/kennethreitz-archive/usajobs-scraper.git
synced 2026-06-05 23:40:16 +00:00
105 lines
4.7 KiB
Python
105 lines
4.7 KiB
Python
from BeautifulSoup import BeautifulSoup
|
|
from kenlib import *
|
|
import re
|
|
import urllib
|
|
import MySQLdb
|
|
|
|
class Job():
    """One scraped job listing that can be printed or stored in MySQL.

    Every constructor argument is the ``contents`` sequence of a parsed tag;
    only its first element is used. That element is stripped of newlines and
    encoded to UTF-8 before being stored on the instance under the same name
    as the argument.
    """

    # Attribute names, in the same order as the constructor arguments and as
    # the columns listed in _INSERT_SQL.
    _FIELDS = (
        'title', 'closingDate', 'agency', 'location', 'salary', 'details',
        'vacancy', 'apply', 'plan', 'appointmentTerm', 'jobStatus',
        'openingDate', 'salaryRange',
    )

    _INSERT_SQL = (
        "INSERT INTO jobs (title, closing_date, agency, location, salary, "
        "details, vacancy, apply, plan, appointment_term, job_status, "
        "opening_date, salary_range) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )"
    )

    def __init__(self, title, closingDate, agency, location, salary, details, vacancy, apply, plan, appointmentTerm, jobStatus, openingDate, salaryRange):
        """Clean each raw ``contents`` sequence and store it as an attribute."""
        rawValues = (title, closingDate, agency, location, salary, details,
                     vacancy, apply, plan, appointmentTerm, jobStatus,
                     openingDate, salaryRange)
        for name, contents in zip(self._FIELDS, rawValues):
            setattr(self, name, self._clean(contents))

    @staticmethod
    def _clean(contents):
        """Return the first item of *contents* without newlines, UTF-8 encoded.

        An empty *contents* (a tag with no children) yields an empty value
        instead of raising IndexError.
        """
        text = contents[0] if contents else u''
        # Strip newlines while the value is still text, then encode once.
        return text.replace('\n', '').encode('utf-8')

    def __str__(self):
        """Return ``"<title> -> <vacancy>"``."""
        return "%s -> %s" % (self.title, self.vacancy)

    def store(self):
        """Insert this job as one row of the ``jobs`` table and commit it.

        The cursor and the connection are closed even when the insert fails;
        the database error itself is not caught here.
        """
        # NOTE(review): the connection settings (including the password) are
        # hard-coded in source; consider loading them from configuration.
        conn = MySQLdb.connect(host="kennethreitz.com", user="admin", passwd="drummer42", db="scraper")
        try:
            cursor = conn.cursor()
            try:
                # Let the driver quote the values rather than interpolating
                # them into the SQL text by hand.
                cursor.execute(self._INSERT_SQL,
                               tuple(getattr(self, name) for name in self._FIELDS))
                # Autocommit is off by default in MySQLdb, so the row must be
                # committed explicitly or it may be discarded on close.
                conn.commit()
            finally:
                cursor.close()
        finally:
            conn.close()

    def inspect(self):
        """Print every field on its own line, followed by blank lines."""
        for name in self._FIELDS:
            print(getattr(self, name))
        print('\n' * 4)
|
|
|
|
# Load the saved results page and parse it once at import time; the scraping
# functions in this file read the module-level ``soup``.
with open('./output') as _output_file:  # LOCAL
    html = _output_file.read()

# ``html`` is already a single string, so it is handed to the parser as-is.
soup = BeautifulSoup(html)
|
|
|
|
def scrapePage():
    """Build a list of Job objects from the parsed page.

    Each field is fetched as its own list of tags via getResults(); the i-th
    tag of every list is combined into the i-th Job. The number of jobs is
    taken from the length of the 'title' list.
    """
    fieldNames = ['title', 'closingDate', 'agency', 'location', 'salary',
                  'details', 'vacancy', 'apply', 'plan', 'appointmentTerm',
                  'jobStatus', 'openingDate', 'salaryRange']
    columns = [getResults(fieldName) for fieldName in fieldNames]

    jobs = []
    for row in range(len(columns[0])):
        # Job takes the raw ``contents`` of each tag, in field order.
        cells = [column[row].contents for column in columns]
        jobs.append(Job(*cells))

    return jobs
|
|
|
|
# Maps each result type to the (tag name, element-id suffix) used to find it.
_RESULT_SELECTORS = {
    'title': ('a', 'lnkTitle'),
    'closingDate': ('span', 'lblDateMiles'),
    'agency': ('span', 'lblCompany'),
    'location': ('span', 'lblArea'),
    'salary': ('span', 'lblSalary'),
    'details': ('div', 'jobDetailBodyDiv'),
    'vacancy': ('span', 'lblVacancyAnnNumber'),
    'apply': ('span', 'lblWhoMayApply'),
    'plan': ('span', 'lblPayPlan'),
    'appointmentTerm': ('span', 'lblAppointmentTerm'),
    'jobStatus': ('span', 'lblJobStatus'),
    'openingDate': ('span', 'lblOpeningDate'),
    'salaryRange': ('span', 'lblSalaryRange'),
}


def getResults(rtype):
    """Return the tags in the module-level ``soup`` that hold field *rtype*.

    *rtype* is one of the keys of _RESULT_SELECTORS. The matching tags are
    those of the mapped tag name whose ``id`` matches the results-repeater
    pattern ending in the mapped suffix. An unrecognised *rtype* returns None.
    """
    selector = _RESULT_SELECTORS.get(rtype)
    if selector is None:
        return None

    tagName, idSuffix = selector
    # Raw string so that ``\d`` reaches the regex engine untouched.
    idPattern = re.compile(
        r'MasterPage1_middleContent__ctlResultsFlat_rptResults__ctl(\d)(\d)?_%s$' % (idSuffix),
        re.IGNORECASE | re.DOTALL)
    return soup.findAll(tagName, {'id': idPattern})
|
|
|
|
|
|
if __name__ == '__main__':
    # Scrape the saved page and store every job found, reporting each title.
    for job in scrapePage():
        # The original used ``$`` here, which is not a Python operator.
        print("Storing %s..." % (job.title,))
        job.store()
|
|
# print len(job)
|
|
# for t in ['title', 'closingDate', 'agency', 'location', 'salary', 'details', 'vacancy', 'apply', 'plan', 'appointmentTerm', 'jobStatus', 'openingDate', 'salaryRange']:
|
|
# print getResults(t)[0].contents[0].encode('utf-8').replace('\n','')
|
|
# print '>' |