This commit is contained in:
2016-02-13 10:28:20 -05:00
parent e643f20d7c
commit e240f4dcf2
6 changed files with 37 additions and 138 deletions
+3 -14
View File
@@ -3,28 +3,18 @@
This is a very simple web service that takes a given URL and returns
a Markdown representation of that page.
Powered by [Readability](http://readability.com/),
[Requests](http://python-guide.org/),
[html2text](http://www.aaronsw.com/2002/html2text/),
[markdown](http://pythonhosted.org/Markdown/),
and [Flask](http://flask.pocoo.org/).
Powered by [Readability](http://readability.com/), [Requests](http://python-guide.org/), [html2text](http://www.aaronsw.com/2002/html2text/), and [Flask](http://flask.pocoo.org/).
## Usage
$ curl 'http://url2markdown.herokuapp.com/?url=http://kennethreitz.org'
# Hi, there.
My name is Kenneth Reitz.
...
Or, if you understand code:
$ mkvirtualenv url2markdown
$ pip install -r requirements.txt
$ READABILITY_TOKEN="XXX" python service.py
Enjoy!
## Configuration
@@ -38,4 +28,3 @@ You can use [autoenv](https://github.com/kennethreitz/autoenv) to do this easily
## License
Unfortunately, this code is released under [GPLv3](http://www.gnu.org/copyleft/gpl.html).
+13 -21
View File
@@ -1,44 +1,36 @@
# -*- coding: utf-8 -*-
import os
import requests
from html2text import html2text
# Readability parser endpoint; it is queried with `url` and `token` parameters.
READABILITY_URL = 'https://www.readability.com/api/content/v1/parser'
# Underscore-prefixed spelling of the same endpoint, defined alongside the public name.
_READABILITY_URL = 'https://www.readability.com/api/content/v1/parser'
def readability(url, timeout=30):
    """Fetch the Readability-parsed version of a page.

    Sends ``url`` and the ``READABILITY_TOKEN`` environment variable to the
    Readability parser endpoint and decodes the JSON reply.

    :param url: address of the page to parse.
    :param timeout: seconds to wait for the parser before giving up, so a
        stalled upstream cannot hang the caller indefinitely.
    :returns: a ``(content, title)`` tuple taken from the reply.
    :raises KeyError: if the reply lacks ``content`` or ``title``.
    :raises ValueError: if the reply body is not valid JSON.
    """
    token = os.environ.get('READABILITY_TOKEN')
    params = {'url': url, 'token': token}
    r = requests.get(READABILITY_URL, params=params, timeout=timeout)
    # Decode the body once; each r.json() call parses the payload again.
    payload = r.json()
    return payload['content'], payload['title']


# The same function is also known under this longer private name.
_get_readability_html_and_title = readability
def convert(html, title=None):
    """Convert HTML to Markdown, optionally under a heading.

    :param html: HTML markup (text, not encoded bytes) to convert.
    :param title: optional page title; when truthy it is placed ahead of
        ``html`` as a level-one Markdown heading before conversion.
    :returns: the text produced by ``html2text``.
    """
    if title:
        # Unicode templates: on Python 2 the byte-string form
        # '# {}'.format(title) raises UnicodeEncodeError as soon as a
        # unicode title contains a non-ASCII character.
        heading = u'# {}'.format(title)
        html = u'\n\n'.join([heading, html])
    return html2text(html)


# The same function is also known under this longer private name.
_convert_html_to_markdown = convert
def meh(url):
    """Return the Markdown rendering of the page at ``url``.

    :param url: address of the page to convert.
    :returns: Markdown text, or ``None`` when the parser reply cannot be
        used (expected keys missing, or a body that is not valid JSON).
    """
    try:
        content, title = readability(url)
        return convert(content, title=title)
    except (KeyError, ValueError):
        # KeyError: the reply lacks the expected keys.
        # ValueError: the reply body could not be decoded as JSON.
        # Either way there is nothing to convert.
        return None


# The same function is also known under this longer descriptive name.
get_readable_content_from_url = meh
if __name__ == '__main__':
    # A parenthesised single-argument print outputs the same text as the
    # print statement on Python 2 and is also valid syntax on Python 3.
    print(meh('http://kennethreitz.org/'))
-1
View File
@@ -8,4 +8,3 @@ html2text==3.200.3
itsdangerous==0.23
requests==2.0.0
wsgiref==0.1.2
markdown==2.3.1
+6 -27
View File
@@ -1,40 +1,19 @@
# -*- coding: utf-8 -*-
from flask import Flask, request, render_template, Markup
from converter import get_readable_content_from_url
from markdown import markdown
from flask import Flask, request, redirect, url_for, render_template
from converter import meh
# Flask application object; the '/' route in this module is registered on it.
app = Flask(__name__)
def _markdown_to_html(text):
    """Render Markdown ``text`` to HTML and wrap the result in ``Markup``."""
    rendered = markdown(text)
    return Markup(rendered)
@app.route('/')
def fuck_gpl3():
    """Serve the landing page, or the converted form of ``?url=...``.

    Query parameters:
        url: page to convert; when absent or empty the index page is shown.
        type: ``'html'`` renders the converted page inside ``index.html``;
            any other value (default ``'markdown'``) returns raw Markdown.

    :returns: a Flask response; ``404 Not Found`` when conversion yields
        nothing.
    """
    url = request.args.get('url')
    if not url:
        return render_template('index.html')

    # Convert only once a URL is known to be present, so a bare visit to
    # '/' never triggers a request to the parser service.
    content = meh(url)
    if not content:
        return '404 Not Found', 404

    # Named output_type rather than `type` to avoid shadowing the builtin.
    output_type = request.args.get('type', 'markdown')
    if output_type == 'html':
        return render_template(
            'index.html',
            converted_url_contents=_markdown_to_html(content),
            page_url=url,
        )
    return content, 200, {'Content-Type': 'text/x-markdown; charset=UTF-8'}
if __name__ == '__main__':
    # Start the Flask application when this module is run as a script.
    app.run()
-9
View File
File diff suppressed because one or more lines are too long
+15 -66
View File
@@ -1,72 +1,21 @@
<!DOCTYPE html>
<html>
<head>
<title>Markdown Please!</title>
<link
href="{{ url_for('static', filename='css/bootstrap.min.css' ) }}"
rel="stylesheet"
/>
</head>
<body>
<nav class="navbar navbar-default navbar-fixed-top">
<div class="container">
<ul class="nav">
<a class="navbar-brand" href="/">Markdown Please!</a>
<head>
<title>url2markdown</title>
</head>
<body>
<h1>url2markdown</h1>
<p>This is a very simple web service that will take a given URL, and return
a Markdown representation of that page.</p>
<form action="/" method="get" class="navbar-form navbar-left">
<fieldset>
<input
type="text"
name="url"
class="form-control"
style="width: 300px;"
placeholder="http://en.wikipedia.org/wiki/Markdown"
{% if page_url %}
value="{{ page_url }}"
{% endif %}
/>
<input type="hidden" name="type" value="html" />
<button type="submit" class="btn btn-default">
Go
</button>
</fieldset>
</form>
<form action="/" method="get">
URL: <input type="text" name="url">
<button type='submit'>Submit</button>
</form>
{% if page_url %}
<a
href="/?url={{ page_url }}"
class="btn btn-default navbar-btn navbar-right"
>
Get the Markdown
</a>
{% endif %}
</ul>
</div>
</nav>
<p>A <a href='http://kennethreitz.org/projects/'>Kenneth Reitz</a> project.</p>
<div class="container">
<div style="padding-top:40px;padding-bottom:80px;">
{% if page_url %}
<div>
{{ converted_url_contents }}
</div>
{% endif %}
<a href="https://github.com/kennethreitz/url2markdown"><img style="position: absolute; top: 0; right: 0; border: 0;" src="https://s3.amazonaws.com/github/ribbons/forkme_right_darkblue_121621.png" alt="Fork me on GitHub"></a>
<a href="https://github.com/kennethreitz/url2markdown">
<img
style="position: absolute; top: 0; right: 0; border: 0; z-index: 10000;"
src="https://s3.amazonaws.com/github/ribbons/forkme_right_darkblue_121621.png"
alt="Fork me on GitHub"
/>
</a>
</div>
</div>
<footer class="navbar navbar-fixed-bottom panel-footer">
<p class="container">
A <a href="http://kennethreitz.org/projects/">Kenneth Reitz</a>/
<a href="http://lumbercoder.com/">Gil Goncalves</a> project.
</p>
</footer>
</body>
</html>
</body>
</html>