Published
Edited
Feb 4, 2021
1 fork
5 stars
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Fetch a URL through a CORS proxy, with a small politeness delay.
// `url` is a URL instance; returns the fetch Response promise.
requester = async (url) => {
  // Be polite: pause 500ms before each request to throttle the crawl
  // (note: this is a delay, not a fetch timeout)
  await new Promise((resolve) => setTimeout(resolve, 500));
  // Use a CORS proxy for this demonstration.
  // Encode the target URL so its own query string and special characters
  // survive being embedded in the proxy's `quest` parameter.
  return fetch(`https://api.codetabs.com/v1/proxy/?quest=${encodeURIComponent(url.href)}`);
}
Insert cell
Insert cell
shouldParse = allowContentType('text/html')
Insert cell
Insert cell
parser = {
const browserDomParser = new DOMParser();

// The `response` argument is the result of the previous call to `requester`
return async ({ location, response }) => {
// Get the response body
const body = await response.text();
// Parse the document using the browser's DOMParser
const document = browserDomParser.parseFromString(body, "text/html");
// Return a result object containing the URL and the document
return document;
};
}
Insert cell
Insert cell
// Only yield pages that actually contain an article title heading
shouldYield = async ({ parsed }) => parsed.querySelector('h1.article_title') !== null
Insert cell
Insert cell
// The `parsed` argument is the document produced by the previous call to `parser`
follower = async function* ({ location, parsed }) {
  // Find all anchors in the page
  const anchors = parsed.getElementsByTagName("a");
  for (const anchor of anchors) {
    const href = anchor.getAttribute("href");
    // Skip anchors with no href attribute — `new URL(null, base)` would
    // otherwise resolve to a bogus ".../null" URL
    if (href === null) continue;
    try {
      // Resolve the `href` relative to the current URL
      yield new URL(href, location);
    } catch {
      // Malformed hrefs make the URL constructor throw; ignore them instead
      // of letting the exception abort the whole generator
    }
  }
}
Insert cell
Insert cell
// Build the `shouldQueue` filter chain used by the crawler.
// Must be called fresh per crawl because the filters are stateful.
createShouldQueue = () => {
  // NASA Mars News article URLs look like "mars.nasa.gov/news/:id/"
  const newsArticlePattern = /\/mars\.nasa\.gov\/news\/([\d]+)\//;

  // Filter that matches a regular expression against the URL's `href`
  const allowUrlRegex = crawler.allowRegex((location) => location.href);

  // `crawler.ignoreDoubles` remembers previously seen values; we key on the
  // ":id" captured from the URL so each article is queued at most once
  const ignoreMarsNewsDoubles = crawler.ignoreDoubles((location) => {
    const match = location.href.match(newsArticlePattern);
    return match?.[1];
  });

  return crawler.chain(
    // Only allow URLs of the NASA Mars News website
    allowUrlRegex([newsArticlePattern]),
    // Ignore already visited articles
    ignoreMarsNewsDoubles(),
    // Allow queuing a maximum of 5 URLs this demo
    allowMaximum(5, console),
  );
}
Insert cell
Insert cell
// Crawl the NASA Mars News site and yield { url, title } records.
crawlNasaMarsNews = async function* () {
  // Recreate `shouldQueue` on every run so the stateful `allowMaximum`
  // and `ignoreMarsNewsDoubles` filters start fresh
  const shouldQueue = createShouldQueue();

  // Assemble the crawler from the pieces defined above
  const fetchCrawler = crawler.createCrawler({
    requester,
    shouldParse,
    parser,
    shouldYield,
    follower,
    shouldQueue,
    // Let's log some info to the console
    logger: console,
  });

  // The crawl starts from the news index page
  const entrypoint = new URL('https://mars.nasa.gov/news');

  // Walk the crawl results as they arrive
  for await (const { location, parsed } of fetchCrawler(entrypoint)) {
    // Pull the article title text out of the parsed document
    // (`?.` handles pages where the element is missing)
    const title = parsed.querySelector('h1.article_title')?.textContent?.trim();

    yield {
      url: location.href,
      title,
    };
  }
}
Insert cell
viewof titlesView = {
// Call the asynchronous iterator and collect the results into an array
const results = await collect(crawlNasaMarsNews());
const resultsWithTitle = results.filter(result => !!result.title);

const renderResult = result => `<li><a href="${result.url}">${result.title}</a></li>`;

return html`
<p>The following titles were just scraped from the NASA Mars News website 🥳</p>
<div>
<ul>
${results.map(renderResult)}
</ul>
</div>`;
}
Insert cell
Insert cell
Insert cell
// Drain an (async) iterator, returning every yielded item as an array.
collect = async function (iterator) {
  const collected = [];
  for await (const value of iterator) collected.push(value);
  return collected;
}
Insert cell
Insert cell
// Create a stateful filter that lets at most `count` calls through,
// optionally logging how many remain via `logger`.
// Fix: the original returned `count > 0` AFTER decrementing, so it only
// admitted `count - 1` calls (and `allowMaximum(1)` admitted none).
allowMaximum = (count, logger) => () => {
  if (count-- > 0) {
    // `count` was just decremented, so it is the number of calls still allowed
    logger?.info(`Allowing ${count} more requests...`);
    return true;
  }
  return false;
}
Insert cell
Insert cell
// Create a `shouldParse`-style filter that accepts only responses whose
// Content-Type header starts with the given prefix (e.g. 'text/html').
allowContentType = (allowedContentType) => ({ response }) => {
  const contentType = response.headers.get('content-type');
  // Headers.get returns null when the header is absent; the original then
  // crashed on `.startsWith` — treat a missing header as "do not parse"
  return contentType !== null && contentType.startsWith(allowedContentType);
}
Insert cell
crawler = import("https://cdn.skypack.dev/crawler-ts@1.1.1")
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more