Feb 4, 2021
1 fork
5 stars
// Fetches a page after a fixed 500 ms pause.
//
// `url` is the location to fetch; only its `href` property is read.
// Resolves to whatever `fetch` resolves to for that address.
requester = async (url) => {
  // Let's be nice and wait a little before every fetch
  await new Promise((resolve) => setTimeout(resolve, 500));
  // NOTE(review): an earlier comment here mentioned a CORS proxy, but no proxy
  // prefix is present in the code — the address is fetched as-is. Confirm
  // whether a proxy prefix still needs to be added for cross-origin use.
  return fetch(url.href);
}
// Filter produced by `allowContentType` for the 'text/html' content type
shouldParse = allowContentType('text/html')
parser = {
const browserDomParser = new DOMParser();

// The `response` argument is the result of the previous call to `requester`
return async ({ location, response }) => {
// Get the response body
const body = await response.text();
// Parse the document using the browser's DOMParser
const document = browserDomParser.parseFromString(body, "text/html");
// Return a result object containing the URL and the document
return document;
// Resolves to true when the parsed document contains an `h1.article_title`
// element, and to false otherwise.
//
// `parsed` must expose `querySelector` (e.g. a DOM Document).
shouldYield = async ({ parsed }) => {
  return parsed.querySelector('h1.article_title') !== null;
}
// Yields an absolute URL for every followable link in a parsed page.
//
// `location` is the URL the page was loaded from (used as the base for relative
// links); `parsed` is the parsed document and must expose `getElementsByTagName`.
follower = async function* ({ location, parsed }) {
  // Find all anchors in the page
  for (const anchor of parsed.getElementsByTagName("a")) {
    const href = anchor.getAttribute("href");
    // Anchors without an `href` have nothing to follow; passing `null` to the
    // URL constructor would silently resolve to a bogus ".../null" address
    if (href === null) {
      continue;
    }

    let target;
    try {
      // Resolve the `href` relative to the current URL
      target = new URL(href, location);
    } catch (error) {
      // A single malformed link must not abort the whole page
      if (error instanceof TypeError) {
        continue;
      }
      throw error;
    }
    yield target;
  }
}
// Builds the queue filter by chaining filters through `crawler.chain`
createShouldQueue = () => {
// Matches NASA Mars News article URLs ("/mars.nasa.gov/news/<digits>/") and captures the numeric id
const nasaMarsNewsUrl = /\/mars\.nasa\.gov\/news\/([\d]+)\//;
// Create a filter that matches a regular expression for the URL's `href` property
const allowUrlRegex = crawler.allowRegex(location => location.href);
// `crawler.ignoreDoubles` is a filter that keeps track of which values are visited
// In this case we find the ":id" piece in the URL and use it to detect duplicates
const ignoreMarsNewsDoubles = crawler.ignoreDoubles(location => {
const match = location.href.match(nasaMarsNewsUrl);
// `newsId` is undefined when the URL does not match the pattern
const newsId = match?.[1]
return newsId;
// NOTE(review): the closing of the callback above is not visible in this text
return crawler.chain(
// Only allow URLs of the NASA Mars News website
// NOTE(review): no argument follows this comment or the next one — presumably `allowUrlRegex` and `ignoreMarsNewsDoubles` are meant to be passed here; confirm against the original notebook
// Ignore already visited
// Allow queuing a maximum of 5 URLs this demo
allowMaximum(5, console),
// Async generator that runs the crawler from the entrypoint and yields one result object per item the crawler produces
crawlNasaMarsNews = async function* () {
// We need to create the `shouldQueue` function anew every time we run the crawler to reset the `allowMaximum` and `ignoreMarsNewsDoubles` filters
const shouldQueue = createShouldQueue();

// Create the crawler
// NOTE(review): only `logger` is visible in this options object and its closing is missing; `shouldQueue` above is never referenced in the visible lines — presumably it is one of the options; confirm
const fetchCrawler = crawler.createCrawler({
// Let's log some info to the console
logger: console,

// This is the entrypoint
// NOTE(review): `new URL('')` throws a TypeError because an empty string is not a valid absolute URL — the real entrypoint address needs to be filled in
const entrypoint = new URL('');

// And start crawling!
for await (const { location, parsed } of fetchCrawler(entrypoint)) {
// Find the title element in the page
const titleElement = parsed.querySelector('h1.article_title');
// And extract the text of the title element
const title = titleElement?.textContent?.trim();

// NOTE(review): only `url` is visible in the yielded object; `title` is computed above but not used in the visible lines
yield {
url: location.href,
// Runs the crawl and renders the scraped titles as HTML
viewof titlesView = {
// Call the asynchronous iterator and collect the results into an array
const results = await collect(crawlNasaMarsNews());
// Keep only the results whose `title` is truthy
const resultsWithTitle = results.filter(result => !!result.title);

// Builds one list item that links to the result's URL and shows its title
// NOTE(review): `url` and `title` originate from scraped pages and are interpolated into markup unescaped — confirm how the resulting string is inserted
const renderResult = result => `<li><a href="${result.url}">${result.title}</a></li>`;

// NOTE(review): the template below is cut off in the visible lines — its remainder (presumably the list built with `renderResult`) and the closing of this cell are missing
return html`
<p>The following titles were just scraped from the NASA Mars News website 🥳</p>
// Drains an (async) iterable and resolves to an array of everything it produced,
// in iteration order.
//
// `iterator` may be any value accepted by `for await`, i.e. an async or sync iterable.
collect = async function (iterator) {
  const array = [];
  for await (const item of iterator) {
    // Every item has to be appended, otherwise the result stays empty
    array.push(item);
  }
  return array;
}
// Creates a filter that returns true for the first `count` calls and false for
// every call after that.
//
// `count`  - how many calls to allow; anything that is not a positive number allows none
// `logger` - optional; when given, its `info` method is called with the number
//            of calls still allowed after each accepted call
allowMaximum = (count, logger) => {
  // Keep the budget in a local instead of mutating the parameter
  let remaining = count;
  return () => {
    // Written as a negated `>` so that NaN/undefined are rejected as well
    if (!(remaining > 0)) {
      return false;
    }
    remaining -= 1;
    logger?.info(`Allowing ${remaining} more requests...`);
    // The call that consumes the last slot is itself still allowed
    return true;
  };
}
// Creates a filter that accepts a response when its `content-type` header
// starts with `allowedContentType`.
//
// The returned filter takes an object with a `response` whose `headers.get`
// is read, and returns a boolean.
allowContentType = (allowedContentType) => ({ response }) => {
  const contentType = response.headers.get('content-type');
  // `get` yields null when the header is absent; treat that as "not allowed"
  // instead of throwing on `.startsWith`
  return contentType?.startsWith(allowedContentType) ?? false;
}
// The crawler library; other cells read `allowRegex`, `ignoreDoubles`, `chain` and `createCrawler` from it
// NOTE(review): the module specifier passed to `import()` is an empty string here — the real module address needs to be filled in
crawler = import("")
