Update

2026-06-05 23:40:16 +00:00 · 2009-11-03 17:48:26 -05:00
parent 464fa4087e
commit ba4276e7cb
3 changed files with 1109146 additions and 73 deletions
@@ -2,89 +2,104 @@ from BeautifulSoup import BeautifulSoup
 from kenlib import *
 import re
 import urllib
+import MySQLdb

 class Job():
-	def __init__(self, title="", closingDate="", agency="", location="", salary=""):
-		self.title = title
-		self.closingDate = closingDate
-		self.agency = agency
-		self.location = location
-		self.salary = salary
-		# self.details = details
+	"""One scraped job posting whose fields are UTF-8 encoded strings without newlines."""
+
+	def __init__(self, title, closingDate, agency, location, salary, details, vacancy, apply, plan, appointmentTerm, jobStatus, openingDate, salaryRange):
+		"""Build a job from thirteen sequences (scrapePage passes tag `.contents` lists).
+
+		Only element 0 of each argument is used: it is encoded as UTF-8 and
+		stripped of newline characters. An empty sequence yields ''.
+		"""
+		def clean(contents):
+			# A tag without children has empty contents; store '' rather than raise IndexError.
+			if not contents:
+				return ''
+			return contents[0].encode('utf-8').replace('\n','')
+
+		self.title = clean(title)
+		self.closingDate = clean(closingDate)
+		self.agency = clean(agency)
+		self.location = clean(location)
+		self.salary = clean(salary)
+		self.details = clean(details)
+		self.vacancy = clean(vacancy)
+		self.apply = clean(apply)
+		self.plan = clean(plan)
+		self.appointmentTerm = clean(appointmentTerm)
+		self.jobStatus = clean(jobStatus)
+		self.openingDate = clean(openingDate)
+		self.salaryRange = clean(salaryRange)
+
 	def __str__(self):
-		return str(self.title)  
-
-# html = urllib.urlopen('http://jobsearch.usajobs.gov/search.aspx').read() 	# INERNET 
-# html = open('./search.html').read() 	# LOCAL
-html = open('./pages/output').read() 	# LOCAL
+		"""Return the title and the vacancy field joined by ' -> '."""
+		return "%s -> %s" % (self.title, self.vacancy)
+
+	def store(self):
+		"""Insert this job as one row of the `jobs` table in the `scraper` database.
+
+		A new MySQL connection is opened for every call and is always closed,
+		even when the insert fails; MySQLdb errors propagate to the caller.
+		"""
+		# NOTE(review): host and credentials are hard-coded in source control;
+		# move them to configuration and rotate the password.
+		conn = MySQLdb.connect(host="kennethreitz.com", user="admin", passwd="drummer42", db="scraper")
+		try:
+			cursor = conn.cursor()
+			try:
+				# Pass the values as parameters so the driver does the quoting.
+				cursor.execute(
+					"INSERT INTO jobs (title, closing_date, agency, location, salary, details, vacancy, apply, plan, appointment_term, job_status, opening_date, salary_range) "
+					"VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )",
+					(self.title, self.closingDate, self.agency, self.location,
+					 self.salary, self.details, self.vacancy, self.apply,
+					 self.plan, self.appointmentTerm, self.jobStatus,
+					 self.openingDate, self.salaryRange))
+			finally:
+				cursor.close()
+			# MySQLdb leaves autocommit off; without a commit the row is
+			# discarded on transactional tables when the connection closes.
+			conn.commit()
+		finally:
+			conn.close()
+
+	def inspect(self):
+		"""Print every field on its own line, then a run of blank lines as a separator."""
+		for value in (self.title, self.closingDate, self.agency, self.location,
+					  self.salary, self.details, self.vacancy, self.apply,
+					  self.plan, self.appointmentTerm, self.jobStatus,
+					  self.openingDate, self.salaryRange):
+			print value
+		print '\n' * 4
+
+html = open('./output').read() 	# LOCAL: the page is read from a file on disk, not fetched over the network
 soup = BeautifulSoup(''.join(html))

 def scrapePage():
-	b, resultSet = [], []	# Initiate Collectors
-	types = ['title', 'closingDate', 'agency', 'location', 'salary', 'details']
-	for type in types:
-		b.append(getResults(type))
-
+	"""Return a list holding one Job per result row found in the parsed page.
+
+	The row count is taken from the `title` matches; every other field is
+	indexed at the same position, so a field with fewer matches raises
+	IndexError.
+	"""
+	# The order must mirror the positional parameters of Job.__init__.
+	fieldNames = ('title', 'closingDate', 'agency', 'location', 'salary',
+				  'details', 'vacancy', 'apply', 'plan', 'appointmentTerm',
+				  'jobStatus', 'openingDate', 'salaryRange')
+	b = [getResults(name) for name in fieldNames]
+	resultSet = []
 	for i in range(len(b[0])):
-		toAppend = []
-		for j in range(len(b)):
-			toAppend.append(b[j][i].contents)
-		resultSet.append(toAppend)
-
+		# Row i is made of the i-th match of every field, in field order.
+		rowContents = [column[i].contents for column in b]
+		resultSet.append(Job(*rowContents))
 	return resultSet

 def getResults(rtype):
-	def getTitles():
-		rType = 'a'
-		rTitle = 'lnkTitle'
-		re0='MasterPage1_middleContent__ctlResultsFlat_rptResults__ctl(\d)(\d)?_%s$' % (rTitle)
-		rg = re.compile(re0,re.IGNORECASE|re.DOTALL)
-		return soup.findAll(rType, { 'id' : rg})
-		
-	def getClosingDates():
-		rType = 'span'
-		rTitle = 'lblDateMiles'
-		re0='MasterPage1_middleContent__ctlResultsFlat_rptResults__ctl(\d)(\d)?_%s$' % (rTitle)
-		rg = re.compile(re0,re.IGNORECASE|re.DOTALL)
-		return soup.findAll(rType, { 'id' : rg})
-		
-	def getAgencies():
-		rType = 'span'
-		rTitle = 'lblCompany'
-		re0='MasterPage1_middleContent__ctlResultsFlat_rptResults__ctl(\d)(\d)?_%s$' % (rTitle)
-		rg = re.compile(re0,re.IGNORECASE|re.DOTALL)
-		return soup.findAll(rType, { 'id' : rg})
-		
-	def getLocations():
-		rType = 'span'
-		rTitle = 'lblArea'
-		re0='MasterPage1_middleContent__ctlResultsFlat_rptResults__ctl(\d)(\d)?_%s$' % (rTitle)
-		rg = re.compile(re0,re.IGNORECASE|re.DOTALL)
-		return soup.findAll(rType, { 'id' : rg})
+	"""Return the elements of the parsed page that hold the field named `rtype`.
+
+	`rtype` is one of the field names listed in the table below; any other
+	value returns None.
+	"""
 	
-	def getSalaries():
-		rType = 'span'
-		rTitle = 'lblSalary'
-		re0='MasterPage1_middleContent__ctlResultsFlat_rptResults__ctl(\d)(\d)?_%s$' % (rTitle)
+	def getMeta(tagName, idSuffix):
+		# Find `tagName` elements whose id ends in `_ctl<one or two digits>_<idSuffix>`
+		# after the results-repeater prefix (matched case-insensitively).
+		re0='MasterPage1_middleContent__ctlResultsFlat_rptResults__ctl(\d)(\d)?_%s$' % (idSuffix)
 		rg = re.compile(re0,re.IGNORECASE|re.DOTALL)
-		return soup.findAll(rType, { 'id' : rg})
-			
-	def getDetails():
-		rType = 'div'
-		rTitle = 'jobDetailBodyDiv'
-		re0='MasterPage1_middleContent__ctlResultsFlat_rptResults__ctl(\d)(\d)?_%s$' % (rTitle)
-		rg = re.compile(re0,re.IGNORECASE|re.DOTALL)
-		content = soup.findAll(rType, { 'id' : rg})
+		return soup.findAll(tagName, { 'id' : rg})

-	if	   rtype == 'title': 		return getTitles()
-	elif	rtype == 'closingDate':	return getTitles()
-	elif	rtype == 'agency': 		return getTitles()
-	elif	rtype == 'location': 	return getTitles()
-	elif	rtype == 'salary':		return getTitles()
-	elif	rtype == 'details':		return getTitles()
+	# (field name, tag name, id suffix) for every supported field.
+	fieldTable = (
+		('title', 'a', 'lnkTitle'),
+		('closingDate', 'span', 'lblDateMiles'),
+		('agency', 'span', 'lblCompany'),
+		('location', 'span', 'lblArea'),
+		('salary', 'span', 'lblSalary'),
+		('details', 'div', 'jobDetailBodyDiv'),
+		('vacancy', 'span', 'lblVacancyAnnNumber'),
+		('apply', 'span', 'lblWhoMayApply'),
+		('plan', 'span', 'lblPayPlan'),
+		('appointmentTerm', 'span', 'lblAppointmentTerm'),
+		('jobStatus', 'span', 'lblJobStatus'),
+		('openingDate', 'span', 'lblOpeningDate'),
+		('salaryRange', 'span', 'lblSalaryRange'),
+	)
+	for fieldName, tagName, idSuffix in fieldTable:
+		if rtype == fieldName:
+			return getMeta(tagName, idSuffix)
 	
 if __name__ == '__main__':
 	for job in scrapePage():
-		print job
-	
-	
+		# Announce each scraped job, then write it to the database.
+		print "Storing %s..." % (job.title)
+		job.store()
+		# print len(job)
+	# Disabled debugging aid: print the first cleaned value found for each field.
+	# for t in ['title', 'closingDate', 'agency', 'location', 'salary', 'details', 'vacancy', 'apply', 'plan', 'appointmentTerm', 'jobStatus', 'openingDate', 'salaryRange']:
+		# print getResults(t)[0].contents[0].encode('utf-8').replace('\n','')
+		# print '>'
@@ -1,8 +1,7 @@
 #!/bin/bash
-FILES="*"
+# Run HTML Tidy over the saved pages and write the result to ./output.
+FILES="./pages/*"
 for f in "$FILES"
 do
  echo "Processing $f file..."
-  # take action on each file. $f store current file name
-  tidy $f > $f
+  # NOTE(review): "$FILES" is quoted in the `for`, so the pattern is not expanded
+  # there and this body runs once with f set to the pattern itself; the unquoted
+  # $f below is what expands to the file list. `>` truncates output each time
+  # this line runs, so unquoting the loop alone would keep only the last file.
+  # Presumably a single tidy call over all pages is intended - TODO confirm.
+  tidy $f > output
 done