I gave a talk, "Data Mining Toolbox", at the University of Cantabria, Santander, Spain, organized by the meetup group "Data Science Santander", on November 9th, 2017.
It was my first public presentation on this topic, in which I summarized some of my experience with HTML data extraction and processing.
Code snippets used in presentation
For convenient access, I have put all the code snippets below instead of creating a new repo on GitHub.
Basic example with reddit
// Collect one plain object per post on the currently loaded reddit listing page.
// Meant to be pasted into the browser console; assumes the page exposes a
// jQuery-compatible `$` (TODO confirm for the page you run it on).
// The result is bound to `a` so that later snippets can refer to the array.
var a = $('.thing.link').map((index, element) => {
    // Wrap the raw DOM node so the jQuery traversal helpers can be used on it.
    var post = $(element);
    var titleLink = post.find(".title a.title");
    return {
        title: titleLink.text(),
        url: titleLink.attr("href"),
        score: post.find(".score.unvoted").text(),
        subreddit: post.find(".subreddit").text()
    };
}).toArray();
And
// Open a new browser window and write the JSON text of the array `a` into it.
{
    const targetDocument = window.open().document;
    targetDocument.write(JSON.stringify(a));
}
This creates a new browser window and writes the JSON representation of the JavaScript array into it.
Pagination in reddit
As was shown in the talk, it is possible to parse not only the page currently loaded in the browser but also a number of subsequent pages.
/**
 * Requests a reddit listing page, extracts its posts and pushes them onto the
 * global array `a`, then recurses to fetch the following page.
 *
 * `a` is not declared here; it must already exist when the first response
 * arrives. Each page is pushed as its own array, so `a` ends up holding one
 * nested array per fetched page.
 *
 * @param {number} n          Number of further pages to request after this
 *                            one; the page for the current call is always
 *                            fetched, even when `n` is 0.
 * @param {string} [afterId]  `data-fullname` of the last post already seen.
 *                            When omitted, the `after` query value is left
 *                            empty instead of becoming the text "undefined".
 */
function _page(n, afterId = "") {
    $.get("https://www.reddit.com/?count=10&after=" + afterId, data => {
        var result = $(data).find('.thing.link').map((index, element) => {
            // Wrap the raw DOM node so the jQuery traversal helpers can be used.
            var post = $(element);
            var titleLink = post.find("p.title a.title");
            return {
                id: post.attr('data-fullname'),
                title: titleLink.text(),
                url: titleLink.attr("href"),
                score: post.find(".score.unvoted").text(),
                subreddit: post.find(".subreddit").text()
            };
        }).toArray();
        console.log("page", n, "=>", result);
        a.push(result);
        if (n <= 0) return;
        // An empty page has no last post to continue from; stop here instead
        // of failing on `undefined.id`.
        if (result.length === 0) return;
        _page(n - 1, result[result.length - 1].id);
    });
}
Code snippet from real project
The code snippet below can extract job listings from the StackOverflow jobs page, including tags and the optional salary. It is published for educational purposes only.
/**
 * Downloads the listing page at `url` and converts every job entry
 * (`div[data-jobid]` under `.jobs .listResults`) into a [StackOverflowJob].
 *
 * Location, tags, company and salary are filled in after construction;
 * company and salary stay `null` when the listing has no matching element.
 *
 * @return one [StackOverflowJob] per listing, in the order the listings were selected.
 * @throws NullPointerException if a listing has no `.-title h2` element.
 */
fun fetchRecentPage(): List<StackOverflowJob> {
    val listings = jsoupGetDocument(url).select(".jobs .listResults > div[data-jobid]")
    return listings.map { listing ->
        val jobId = wipeoutHtml(listing.attr("data-jobid").trim())
        val jobUrl = normalizeUrl(wipeoutHtml(listing.select(".-title a.job-link").attr("href").trim()))
        // Same exception type as the former `!!`, but the message now says which listing is malformed.
        val title = listing.select(".-title h2").first()?.text()
            ?: throw NullPointerException("Job listing '$jobId' has no '.-title h2' element")
        val description = listing.select("p.description").text()

        val job = StackOverflowJob(jobId, jobUrl, title, description)
        // `select(...)` yields a (possibly empty) collection, so `text()` can be called without safe calls.
        job.location = listing.select("li.location").text().trim()
        // mapNotNull is kept so a null from wipeoutHtml, if it can return one, is dropped.
        job.tags = listing.select(".tags a").mapNotNull { tagLink ->
            wipeoutHtml(tagLink.text().trim())
        }
        // `first()` is null when nothing matched, so these remain optional.
        job.company = listing.select("li.employer").first()?.text()?.trim()
        job.salary = listing.select(".salary").first()?.text()?.trim()
        job
    }
}
This example uses Kotlin and jsoup.