Files
Kenneth Reitz c2912820e2 import fix
2011-01-30 18:18:31 -05:00

172 lines
4.3 KiB
Python

# -*- coding: utf-8 -*-
import cookielib
import datetime
import os
import urllib2
from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse as dtime
from django.core.management import setup_environ
#from django.db.utils import IntegrityError
from wincstar import settings
#os.environ['DJANGO_SETTINGS_MODULE'] = 'wincstar.settings'
#setup_environ(settings)
from django.core.management.base import BaseCommand, CommandError
from wincstar.ripper.models import Article as DjangoArticle
# The current year as a string, evaluated once when this module is imported.
# parse_article() uses it to cut an article's date line off right after the year.
YEAR = str(datetime.datetime.now().year)
class Article(object):
    """A scraped article, held in memory until it is copied into Django.

    Every attribute starts out as ``None`` and is expected to be filled in
    by the scraper before ``to_django`` is called: ``slug``, ``title``,
    ``subtitle``, ``published`` (handed to the date parser in
    ``to_django``), ``author``, ``content`` and ``url``.
    """

    # Content must be strictly longer than this many characters to be saved;
    # anything shorter is reported as having no usable content.
    MIN_CONTENT_LENGTH = 1000

    def __init__(self):
        self.slug = None
        self.title = None
        self.subtitle = None
        self.published = None
        self.author = None
        self.content = None
        self.url = None

    def to_django(self):
        """Save this article as a ``DjangoArticle`` if it has enough content.

        The content check runs first so that articles which will be
        discarded never touch the date parser or the model, and so that a
        missing (``None``) content is reported as "no usable content"
        instead of raising ``TypeError`` from ``len(None)``.

        Returns ``None``. Exceptions raised while parsing ``published`` or
        while saving the model propagate to the caller.
        """
        content = self.content or ''
        if len(content) <= self.MIN_CONTENT_LENGTH:
            print('%s had no usable content.' % (self.title if self.title else ''))
            return

        art = DjangoArticle()
        art.title = self.title
        art.subtitle = self.subtitle
        # Echo the raw date text before parsing so a bad value is visible
        # in the command output.
        print(self.published)
        art.published = dtime(self.published)
        art.author = self.author
        art.content = self.content
        art.ourl = self.url
        art.slug = self.slug
        art.save()
def get_articles(url='http://www.winchesterstar.com/members/login'):
    """Log in to winchesterstar.com and collect article URLs linked from *url*.

    The page at *url* is fetched with a cookie-aware opener after posting
    the login form. Every ``<a>`` whose ``href`` contains
    ``'homepage_links'`` is then opened, and the final URL reported by the
    response (``geturl()``) is collected.

    Returns a list of URL strings, in document order.
    """
    opener = urllib2.build_opener(
        urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    opener.addheaders.append(
        ('Content-Type', 'application/x-www-form-urlencoded'))

    # NOTE(review): account credentials are hard-coded in source. They should
    # be moved to settings or the environment; left in place here so the
    # function's behaviour and signature stay unchanged.
    login_data = (
        r'_method=POST&data%5BMember%5D%5Bemail'
        r'%5D=thepythonist%40gmail.com&data%5BMember'
        r'%5D%5Bpassword%5D=UXe1b&data%5BMember%5D%5Bremember%5D=0'
    )

    # Post the login form first; any cookies the site sets are kept in the
    # opener's cookie jar for the requests that follow.
    opener.open('http://www.winchesterstar.com/members/login', login_data).read()
    # The form data is sent along with the page request too, so it goes out
    # as a POST as well.
    content = opener.open(url, login_data).read()

    soup = BeautifulSoup(content)
    links = []
    for anchor in soup.findAll('a'):
        # Anchors without an href (e.g. <a name="...">) would raise KeyError
        # on anchor['href']; skip them instead of aborting the whole scrape.
        href = anchor.get('href')
        if not href or 'homepage_links' not in href:
            continue
        response = opener.open('http://www.winchesterstar.com/%s' % str(href))
        try:
            # Only the final URL is needed, so close the response without
            # reading the body.
            links.append(response.geturl())
        finally:
            response.close()
    return links
def parse_article(content):
    """Build an Article from the raw HTML of a single article page.

    Fills in ``title``, ``published``, ``content``, ``subtitle`` and, when
    a byline is found, ``author``. A missing ``<h3>`` leaves ``subtitle``
    as ``None``; a missing byline leaves ``author`` untouched.
    """
    parsed = Article()

    # The article lives in whichever <td> max(..., key=len) selects.
    cells = BeautifulSoup(content).findAll('td')
    body_cell = max(cells, key=len)

    parsed.title = body_cell.find('h2').text

    # Date line: keep what precedes 'By', cut at the first occurrence of the
    # current year, then put the year back on the end.
    dateline = body_cell.findNext('div').text
    before_byline = dateline.split('By')[0]
    parsed.published = before_byline.split(YEAR)[0] + YEAR

    # Body markup: the longest piece between '<hr />' separators, keeping
    # only what follows the last '</style>' in that piece.
    pieces = str(body_cell).split('<hr />')
    longest_piece = max(pieces, key=len).lstrip()
    markup = longest_piece.split('</style>')[-1].lstrip()
    parsed.content = BeautifulSoup(markup).prettify()

    try:
        subtitle = body_cell.find('h3').text
    except AttributeError:
        # find() returned None: the page has no <h3>.
        subtitle = None
    parsed.subtitle = subtitle

    try:
        inner_div = body_cell.find('div').find('div')
        parsed.author = inner_div.find('em').text.replace('By ', '')
    except AttributeError:
        # Some link in the div/div/em chain is missing; leave author as is.
        pass

    return parsed
def date_range(start_date, end_date=None):
    """Yield every day from ``start_date`` through ``end_date``, inclusive.

    Arguments may be ``datetime.date`` or ``datetime.datetime`` objects;
    datetimes are reduced to their date part. When ``end_date`` is omitted
    the range ends on today's date, so existing single-argument callers
    behave as before.

    Raises ``ValueError`` if ``start_date`` falls after ``end_date``. This
    is a generator, so the error surfaces on the first iteration rather
    than at call time.

    Example usage

    >>> import datetime
    >>> list(date_range(datetime.date(2009, 1, 1), datetime.date(2009, 1, 3)))
    [datetime.date(2009, 1, 1), datetime.date(2009, 1, 2), datetime.date(2009, 1, 3)]
    """
    if end_date is None:
        end_date = datetime.datetime.now()

    # datetime objects are reduced to plain dates so the two ends can be
    # compared and stepped one whole day at a time.
    if isinstance(start_date, datetime.datetime):
        start_date = start_date.date()
    if isinstance(end_date, datetime.datetime):
        end_date = end_date.date()

    # Reject a range whose start falls after its end.
    if start_date > end_date:
        raise ValueError('You provided a start_date that comes after the end_date.')

    one_day = datetime.timedelta(days=1)
    current = start_date
    while current <= end_date:
        yield current
        current += one_day
class Command(BaseCommand):
    """Management command that scrapes the edition page and stores its articles."""
    args = ''
    help = 'Imports feeds.'

    def handle(self, *args, **options):
        """Fetch, parse and save every article linked from each edition page.

        Dates come from ``date_range(datetime.datetime.now())``. Each
        article URL returned by ``get_articles`` is downloaded, parsed with
        ``parse_article`` and saved through ``Article.to_django``.

        Saving is best-effort: a failure on one article is reported with
        its actual cause and the run continues with the next one.
        """
        for date in date_range(datetime.datetime.now()):
            print('Grabbing %s' % (date))
            edition_url = 'http://www.winchesterstar.com/pages/choose_edition/date:%s' % date
            for url in get_articles(edition_url):
                response = urllib2.urlopen(url)
                try:
                    page_content = response.read()
                finally:
                    # Release the connection even if the read fails.
                    response.close()

                article = parse_article(page_content)
                article.url = url
                # Slug is the edition date plus the last URL segment, with
                # underscores turned into dashes.
                article.slug = '%s-%s' % (date, url.split('/')[-1].replace('_', '-'))
                try:
                    article.to_django()
                except Exception as why:
                    # Previously every failure was reported as "already
                    # exists", hiding date-parse and database errors. Show
                    # the real cause; %r avoids encoding errors on
                    # non-ASCII exception messages.
                    print('%s could not be saved: %r' % (article.title, why))
                print('Grabbing: %s' % (article.title))