convore.json/groups/interesting-papers/text-extraction/messages.json


			
				
					
					
						
						
							
							
							[{"user_id": 14944, "stars": [{"date_created": 1300716307.8073821, "user_id": 7376}, {"date_created": 1300749069.892334, "user_id": 23389}, {"date_created": 1300753328.405935, "user_id": 1128}, {"date_created": 1300778066.7209351, "user_id": 14074}, {"date_created": 1300785359.5412381, "user_id": 21923}, {"date_created": 1300975083.262507, "user_id": 13912}, {"date_created": 1302101133.6203251, "user_id": 27739}, {"date_created": 1302543144.913444, "user_id": 15343}, {"date_created": 1302974962.360584, "user_id": 29843}, {"date_created": 1303300465.1854961, "user_id": 27245}], "topic_id": 14045, "date_created": 1300608618.015507, "message": "http://tomazkovacic.com/blog/14/extracting-article-text-from-html-documents", "group_id": 12, "id": 391280}, {"user_id": 506, "stars": [{"date_created": 1302543149.371825, "user_id": 15343}], "topic_id": 14045, "date_created": 1300747915.5139811, "message": "This is a surprisingly in-depth article on different techniques for scraping the contents of HTML documents and preserving the overall page layout and structure (header, sidebars, main content, footers, etc..). Definitely worth a read for the discussion on algorithms and also Boilerplate library recommendation.", "group_id": 12, "id": 403357}, {"user_id": 10661, "stars": [], "topic_id": 14045, "date_created": 1302475462.0131199, "message": "thank you, this blog article actually offers a good overview over different techniques for \"content classification\" to extract the actual content/article from an HTML source code.", "group_id": 12, "id": 615073}]