This commit is contained in:
Kenneth Reitz
2009-11-03 17:48:26 -05:00
parent 464fa4087e
commit ba4276e7cb
3 changed files with 1109146 additions and 73 deletions
+1109059
View File
File diff suppressed because one or more lines are too long
+85 -70
View File
@@ -2,89 +2,104 @@ from BeautifulSoup import BeautifulSoup
from kenlib import *
import re
import urllib
import MySQLdb
def _job_field_text(contents):
    """Return the first item of *contents* as UTF-8 bytes without newlines.

    *contents* is a sequence whose first element is a unicode string (the
    scraper passes a tag's ``.contents`` list).  An empty sequence yields an
    empty string instead of raising IndexError.
    """
    if contents:
        text = contents[0]
    else:
        text = u''
    # Stripping '\n' before encoding gives the same bytes as stripping after
    # it (0x0A never occurs inside a multi-byte UTF-8 sequence).
    return text.replace(u'\n', u'').encode('utf-8')


class Job():
    """A single scraped job listing and the metadata stored with it.

    Every constructor argument is a sequence whose first element holds the
    field's text; each value is kept on the instance as a UTF-8 encoded
    string with newlines removed.
    """

    def __init__(self, title, closingDate, agency, location, salary, details, vacancy, apply, plan, appointmentTerm, jobStatus, openingDate, salaryRange):
        self.title = _job_field_text(title)
        self.closingDate = _job_field_text(closingDate)
        self.agency = _job_field_text(agency)
        self.location = _job_field_text(location)
        self.salary = _job_field_text(salary)
        self.details = _job_field_text(details)
        self.vacancy = _job_field_text(vacancy)
        self.apply = _job_field_text(apply)
        self.plan = _job_field_text(plan)
        self.appointmentTerm = _job_field_text(appointmentTerm)
        self.jobStatus = _job_field_text(jobStatus)
        self.openingDate = _job_field_text(openingDate)
        self.salaryRange = _job_field_text(salaryRange)

    def __str__(self):
        """Return ``"<title> -> <vacancy>"``."""
        return "%s -> %s" % (self.title, self.vacancy)

    def store(self):
        """Insert this job as one row of the ``jobs`` table.

        Opens a new MySQL connection for the call and always closes it,
        even when the insert fails.  Errors raised by MySQLdb propagate.
        """
        # NOTE(review): host, user and password are hard-coded in source
        # control; move them to configuration and rotate the password.
        conn = MySQLdb.connect(host="kennethreitz.com", user="admin", passwd="drummer42", db="scraper")
        try:
            # Column order matches the parameter tuple below.
            query = "INSERT INTO jobs (title, closing_date, agency, location, salary, details, vacancy, apply, plan, appointment_term, job_status, opening_date, salary_range) " + \
                    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )"
            params = (self.title, self.closingDate,
                      self.agency, self.location,
                      self.salary, self.details,
                      self.vacancy, self.apply,
                      self.plan, self.appointmentTerm,
                      self.jobStatus, self.openingDate,
                      self.salaryRange)
            cursor = conn.cursor()
            try:
                # Let the driver quote the values instead of formatting the
                # SQL text by hand.
                cursor.execute(query, params)
            finally:
                cursor.close()
            # DB-API connections do not autocommit by default; without this
            # the row is discarded on transactional tables.
            conn.commit()
        finally:
            conn.close()

    def inspect(self):
        """Print every stored field on its own line, then a blank gap."""
        fields = (self.title, self.closingDate, self.agency, self.location,
                  self.salary, self.details, self.vacancy, self.apply,
                  self.plan, self.appointmentTerm, self.jobStatus,
                  self.openingDate, self.salaryRange)
        for value in fields:
            print(value)
        print('\n' * 4)
# Load the saved results page once and parse it; getResults() searches the
# resulting ``soup`` tree.
_page = open('./output')  # LOCAL
try:
    html = _page.read()
finally:
    # Close the handle explicitly rather than relying on garbage collection.
    _page.close()
soup = BeautifulSoup(html)
def scrapePage():
    """Build one ``Job`` per listing found on the parsed page.

    Each field is looked up with ``getResults``; the i-th match of every
    field is combined into the i-th ``Job``.

    Returns a list of ``Job`` objects, in page order.
    """
    # Order matters: it is the positional order of Job.__init__.
    fieldNames = ('title', 'closingDate', 'agency', 'location', 'salary',
                  'details', 'vacancy', 'apply', 'plan', 'appointmentTerm',
                  'jobStatus', 'openingDate', 'salaryRange')
    columns = [getResults(name) for name in fieldNames]

    resultSet = []
    # NOTE(review): zip stops at the shortest column, so a listing that lacks
    # one of the fields shortens the result instead of raising IndexError --
    # confirm the page always renders every field for every listing.
    for tags in zip(*columns):
        resultSet.append(Job(*[tag.contents for tag in tags]))
    return resultSet
def getResults(rtype):
    """Find every tag on the parsed page that holds the given job field.

    rtype -- field name: 'title', 'closingDate', 'agency', 'location',
             'salary', 'details', 'vacancy', 'apply', 'plan',
             'appointmentTerm', 'jobStatus', 'openingDate' or 'salaryRange'.

    Returns whatever ``soup.findAll`` yields for tags of the field's element
    type whose ``id`` matches the results-repeater pattern, or None when
    *rtype* is not one of the names above.
    """
    # field name -> (tag name, suffix of the tag's id attribute)
    fields = {
        'title': ('a', 'lnkTitle'),
        'closingDate': ('span', 'lblDateMiles'),
        'agency': ('span', 'lblCompany'),
        'location': ('span', 'lblArea'),
        'salary': ('span', 'lblSalary'),
        'details': ('div', 'jobDetailBodyDiv'),
        'vacancy': ('span', 'lblVacancyAnnNumber'),
        'apply': ('span', 'lblWhoMayApply'),
        'plan': ('span', 'lblPayPlan'),
        'appointmentTerm': ('span', 'lblAppointmentTerm'),
        'jobStatus': ('span', 'lblJobStatus'),
        'openingDate': ('span', 'lblOpeningDate'),
        'salaryRange': ('span', 'lblSalaryRange'),
    }
    if rtype not in fields:
        return None

    tagName, idSuffix = fields[rtype]
    # The id embeds a one- or two-digit row index (``_ctl<d>`` / ``_ctl<dd>``)
    # followed by the field-specific suffix.
    re0 = r'MasterPage1_middleContent__ctlResultsFlat_rptResults__ctl(\d)(\d)?_%s$' % (idSuffix,)
    rg = re.compile(re0, re.IGNORECASE | re.DOTALL)
    return soup.findAll(tagName, {'id': rg})
if __name__ == '__main__':
    # Scrape every listing from the parsed page and persist each one.
    for job in scrapePage():
        print(job)
        print("Storing %s..." % (job.title))
        job.store()
+2 -3
View File
@@ -1,8 +1,7 @@
#!/bin/bash
# Run tidy over every file in ./pages and collect the results in ./output.

# Start from an empty output file so that reruns do not accumulate.
: > output

# The glob is left unquoted so the shell expands it to one word per file.
for f in ./pages/*
do
  # With no matches the pattern stays literal; skip anything that is not a file.
  [ -f "$f" ] || continue
  echo "Processing $f file..."
  # Append: writing with '>' would keep only the last file's result, and
  # redirecting onto "$f" itself would truncate the input before tidy reads it.
  tidy "$f" >> output
done