Published
Edited
Feb 4, 2021
1 fork
5 stars
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Fetch a URL through a CORS proxy, with a small politeness delay.
// `url` is a URL instance; returns the fetch Response promise.
requester = async (url) => {
  // Be polite: pause 500ms before each request to throttle the crawl
  // (note: this is a delay, not a fetch timeout)
  await new Promise((resolve) => setTimeout(resolve, 500));
  // Use a CORS proxy for this demonstration.
  // Encode the target URL so its own query string and special characters
  // survive being embedded in the proxy's `quest` parameter.
  return fetch(`https://api.codetabs.com/v1/proxy/?quest=${encodeURIComponent(url.href)}`);
}
Insert cell
Insert cell
shouldParse = allowContentType('text/html')
Insert cell
Insert cell
parser = {
const browserDomParser = new DOMParser();

// The `response` argument is the result of the previous call to `requester`
return async ({ location, response }) => {
// Get the response body
const body = await response.text();
// Parse the document using the browser's DOMParser
const document = browserDomParser.parseFromString(body, "text/html");
// Return a result object containing the URL and the document
return document;
};
}
Insert cell
Insert cell
// Only yield pages that actually contain an article title heading
shouldYield = async ({ parsed }) => parsed.querySelector('h1.article_title') !== null
Insert cell
Insert cell
// The `parsed` argument is the document produced by the previous call to `parser`
follower = async function* ({ location, parsed }) {
  // Find all anchors in the page
  const anchors = parsed.getElementsByTagName("a");
  for (const anchor of anchors) {
    const href = anchor.getAttribute("href");
    // Skip anchors with no href attribute — `new URL(null, base)` would
    // otherwise resolve to a bogus ".../null" URL
    if (href === null) continue;
    try {
      // Resolve the `href` relative to the current URL
      yield new URL(href, location);
    } catch {
      // Malformed hrefs make the URL constructor throw; ignore them instead
      // of letting the exception abort the whole generator
    }
  }
}
Insert cell
Insert cell
// Build the `shouldQueue` filter chain used by the crawler.
// Must be called fresh per crawl because the filters are stateful.
createShouldQueue = () => {
  // NASA Mars News article URLs look like "mars.nasa.gov/news/:id/"
  const newsArticlePattern = /\/mars\.nasa\.gov\/news\/([\d]+)\//;

  // Filter that matches a regular expression against the URL's `href`
  const allowUrlRegex = crawler.allowRegex((location) => location.href);

  // `crawler.ignoreDoubles` remembers previously seen values; we key on the
  // ":id" captured from the URL so each article is queued at most once
  const ignoreMarsNewsDoubles = crawler.ignoreDoubles((location) => {
    const match = location.href.match(newsArticlePattern);
    return match?.[1];
  });

  return crawler.chain(
    // Only allow URLs of the NASA Mars News website
    allowUrlRegex([newsArticlePattern]),
    // Ignore already visited articles
    ignoreMarsNewsDoubles(),
    // Allow queuing a maximum of 5 URLs this demo
    allowMaximum(5, console),
  );
}
Insert cell
Insert cell
// Crawl the NASA Mars News site and yield { url, title } records.
crawlNasaMarsNews = async function* () {
  // Recreate `shouldQueue` on every run so the stateful `allowMaximum`
  // and `ignoreMarsNewsDoubles` filters start fresh
  const shouldQueue = createShouldQueue();

  // Assemble the crawler from the pieces defined above
  const fetchCrawler = crawler.createCrawler({
    requester,
    shouldParse,
    parser,
    shouldYield,
    follower,
    shouldQueue,
    // Let's log some info to the console
    logger: console,
  });

  // The crawl starts from the news index page
  const entrypoint = new URL('https://mars.nasa.gov/news');

  // Walk the crawl results as they arrive
  for await (const { location, parsed } of fetchCrawler(entrypoint)) {
    // Pull the article title text out of the parsed document
    // (`?.` handles pages where the element is missing)
    const title = parsed.querySelector('h1.article_title')?.textContent?.trim();

    yield {
      url: location.href,
      title,
    };
  }
}
Insert cell
viewof titlesView = {
// Call the asynchronous iterator and collect the results into an array
const results = await collect(crawlNasaMarsNews());
const resultsWithTitle = results.filter(result => !!result.title);

const renderResult = result => `<li><a href="${result.url}">${result.title}</a></li>`;

return html`
<p>The following titles were just scraped from the NASA Mars News website 🥳</p>
<div>
<ul>
${results.map(renderResult)}
</ul>
</div>`;
}
Insert cell
Insert cell
Insert cell
// Drain an (async) iterator, returning every yielded item as an array.
collect = async function (iterator) {
  const collected = [];
  for await (const value of iterator) collected.push(value);
  return collected;
}
Insert cell
Insert cell
// Create a stateful filter that lets at most `count` calls through,
// optionally logging how many remain via `logger`.
// Fix: the original returned `count > 0` AFTER decrementing, so it only
// admitted `count - 1` calls (and `allowMaximum(1)` admitted none).
allowMaximum = (count, logger) => () => {
  if (count-- > 0) {
    // `count` was just decremented, so it is the number of calls still allowed
    logger?.info(`Allowing ${count} more requests...`);
    return true;
  }
  return false;
}
Insert cell
Insert cell
// Create a `shouldParse`-style filter that accepts only responses whose
// Content-Type header starts with the given prefix (e.g. 'text/html').
allowContentType = (allowedContentType) => ({ response }) => {
  const contentType = response.headers.get('content-type');
  // Headers.get returns null when the header is absent; the original then
  // crashed on `.startsWith` — treat a missing header as "do not parse"
  return contentType !== null && contentType.startsWith(allowedContentType);
}
Insert cell
crawler = import("https://cdn.skypack.dev/crawler-ts@1.1.1")
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more