Published
Edited
Feb 4, 2021
1 fork
5 stars
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Requester for the crawler: pauses for 500 ms, then fetches `url.href`
// through the codetabs proxy and returns the pending `fetch` result.
requester = async (url) => {
  // Be polite: wait a little before every request
  const pauseMs = 500;
  await new Promise((done) => {
    setTimeout(done, pauseMs);
  });
  // Route the request through a CORS proxy for this demonstration
  const proxiedUrl = `https://api.codetabs.com/v1/proxy/?quest=${url.href}`;
  return fetch(proxiedUrl);
}
Insert cell
Insert cell
// Filter deciding whether a response gets parsed: the predicate returned by
// `allowContentType` for the 'text/html' content type.
shouldParse = allowContentType('text/html')
Insert cell
Insert cell
parser = {
const browserDomParser = new DOMParser();

// The `response` argument is the result of the previous call to `requester`
return async ({ location, response }) => {
// Get the response body
const body = await response.text();
// Parse the document using the browser's DOMParser
const document = browserDomParser.parseFromString(body, "text/html");
// Return a result object containing the URL and the document
return document;
};
}
Insert cell
Insert cell
// Yield filter: resolves to true only when the parsed page contains an
// `h1.article_title` element, false otherwise.
shouldYield = async ({ parsed }) => parsed.querySelector('h1.article_title') !== null
Insert cell
Insert cell
// Link follower: yields a URL for every usable anchor found in a parsed page.
//   location - URL of the page being processed; relative links resolve against it
//   parsed   - the parsed document (must provide `getElementsByTagName`)
// Anchors without an `href` attribute and links that cannot be parsed as a URL
// are skipped, so one broken link does not abort the whole crawl.
follower = async function* ({ location, parsed }) {
  // Find all anchors in the page
  const anchors = parsed.getElementsByTagName("a");
  for (const anchor of anchors) {
    const href = anchor.getAttribute("href");
    // `getAttribute` returns null for anchors without an href (e.g. <a name="...">);
    // passing that to `new URL` would resolve the literal string "null"
    if (href === null) {
      continue;
    }
    let target;
    try {
      // Resolve the `href` relative to the current URL
      target = new URL(href, location);
    } catch (error) {
      // `new URL` throws a TypeError on malformed links; ignore those links only
      if (error instanceof TypeError) {
        continue;
      }
      throw error;
    }
    yield target;
  }
}
Insert cell
Insert cell
// Builds a fresh queue filter. The filters created here are stateful, so every
// call returns a new chain with its own state.
createShouldQueue = () => {
  // Matches hrefs containing "/mars.nasa.gov/news/<digits>/" and captures the digits
  const marsNewsPattern = /\/mars\.nasa\.gov\/news\/([\d]+)\//;

  // Both filter factories below are keyed on a value derived from the URL
  const hrefOf = (location) => location.href;
  // The captured news id, or undefined when the href does not match the pattern
  const newsIdOf = (location) => location.href.match(marsNewsPattern)?.[1];

  // Regex filter applied to the URL's `href`
  const matchHref = crawler.allowRegex(hrefOf);
  // Duplicate filter keyed on the news id rather than on the full URL
  const skipSeenNewsIds = crawler.ignoreDoubles(newsIdOf);

  return crawler.chain(
    // 1. keep only NASA Mars News article URLs
    matchHref([marsNewsPattern]),
    // 2. drop articles whose id was seen before
    skipSeenNewsIds(),
    // 3. cap how many URLs get queued in this demo, logging to the console
    allowMaximum(5, console),
  );
}
Insert cell
Insert cell
// Crawls the NASA Mars News site starting at https://mars.nasa.gov/news and
// yields one `{ url, title }` object per page the crawler yields.
crawlNasaMarsNews = async function* () {
  // Assemble the crawler from the notebook's cells
  const crawl = crawler.createCrawler({
    requester,
    shouldParse,
    parser,
    shouldYield,
    follower,
    // A new filter per run, so the stateful filters inside it start from scratch
    shouldQueue: createShouldQueue(),
    // Send the crawler's log output to the console
    logger: console,
  });

  // Where the crawl starts
  const startUrl = new URL('https://mars.nasa.gov/news');

  for await (const { location, parsed } of crawl(startUrl)) {
    // Trimmed text of the article heading; undefined when the element is missing
    const title = parsed.querySelector('h1.article_title')?.textContent?.trim();

    yield { url: location.href, title };
  }
}
Insert cell
viewof titlesView = {
// Call the asynchronous iterator and collect the results into an array
const results = await collect(crawlNasaMarsNews());
const resultsWithTitle = results.filter(result => !!result.title);

const renderResult = result => `<li><a href="${result.url}">${result.title}</a></li>`;

return html`
<p>The following titles were just scraped from the NASA Mars News website 🥳</p>
<div>
<ul>
${results.map(renderResult)}
</ul>
</div>`;
}
Insert cell
Insert cell
Insert cell
// Drains an iterable (async or sync) and resolves to an array holding every
// item in iteration order.
collect = async function (iterator) {
  const collected = [];
  for await (const value of iterator) {
    collected[collected.length] = value;
  }
  return collected;
}
Insert cell
Insert cell
// Creates a stateful filter that returns true for the first `count` calls and
// false for every call after that.
//   count  - how many calls to allow in total
//   logger - optional; when given, `logger.info` receives the number of calls
//            still allowed after each accepted call
allowMaximum = (count, logger) => {
  // Track the budget in a local so the parameter itself is never mutated
  let remaining = count;
  return () => {
    if (remaining <= 0) {
      return false;
    }
    remaining -= 1;
    logger?.info(`Allowing ${remaining} more requests...`);
    // The call that uses up the last slot is still allowed
    return true;
  };
}
Insert cell
Insert cell
// Creates a filter that accepts a `{ response }` object only when the response's
// Content-Type header starts with `allowedContentType`.
//   allowedContentType - media type prefix to accept, e.g. 'text/html'
// Returns false (instead of throwing) when the response has no Content-Type
// header. The comparison ignores case because media types are case-insensitive.
allowContentType = (allowedContentType) => {
  const expectedPrefix = allowedContentType.toLowerCase();
  return ({ response }) => {
    // `Headers.get` returns null when the header is absent
    const contentType = response.headers.get('content-type');
    if (contentType === null || contentType === undefined) {
      return false;
    }
    return contentType.toLowerCase().startsWith(expectedPrefix);
  };
}
Insert cell
crawler = import("https://cdn.skypack.dev/crawler-ts@1.1.1")
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more