Project: CLI Web Crawler Tool
In this project, we will build a command-line interface (CLI) crawler that fetches web pages, extracts hyperlinks using regular expressions, and writes the results to a structured JSON file.
1. Project Specifications
- Target: Download page markup from a specified URL.
- Extraction: Extract every unique link found in href attributes, resolving relative links to absolute URLs.
- Output: Save the results into a file named links.json.
- Execution: Run from the command line by passing the target URL as an argument.
2. Implementing the Crawler
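Create a new file named crawler.js and paste the following implementation. Because the script uses ES module import syntax, either add "type": "module" to your package.json or name the file crawler.mjs instead: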
import https from "https";
import fs from "fs/promises";
import url from "url";
// The URL to crawl is the first user-supplied argument: node crawler.js <url>
const [, , targetUrl] = process.argv;

// Without a URL there is nothing to crawl: print usage help and exit non-zero.
if (!targetUrl) {
  const usageLines = [
    "Error: Please provide a target URL.",
    "Usage: node crawler.js https://example.com",
  ];
  for (const line of usageLines) {
    console.error(line);
  }
  process.exit(1);
}
/**
 * Download the body of an HTTPS resource and return it as a UTF-8 string.
 *
 * Only a 200 response is treated as success; any other status code
 * (including 3xx redirects, which are not followed) rejects.
 *
 * @param {string|URL} target - Address handed straight to `https.get`.
 * @returns {Promise<string>} Resolves with the complete response body.
 *   Rejects with an Error when the status code is not 200, or when the
 *   request or the response stream reports an error.
 */
function fetchHtml(target) {
  return new Promise((resolve, reject) => {
    https
      .get(target, (res) => {
        if (res.statusCode !== 200) {
          // The body must still be consumed, otherwise the socket is never
          // released back to the agent.
          res.resume();
          reject(new Error(`Failed to fetch page, status code: ${res.statusCode}`));
          return;
        }

        // Decode at the stream level: converting each Buffer chunk on its own
        // corrupts multi-byte characters that straddle a chunk boundary.
        res.setEncoding("utf8");

        let rawData = "";
        res.on("data", (chunk) => {
          rawData += chunk;
        });
        res.on("end", () => {
          resolve(rawData);
        });
        // A connection that breaks mid-body surfaces here, not on the request.
        res.on("error", reject);
      })
      .on("error", reject);
  });
}
/**
 * Collect the unique link targets found in `href` attributes of a page.
 *
 * Matches `href` attributes whose value is wrapped in double or single
 * quotes, regardless of attribute-name case and with optional whitespace
 * around `=`. Attributes that merely end in "href" (such as `data-href`)
 * are not treated as links. Each value is resolved against `baseUrl`, so
 * relative links come back as absolute URLs.
 *
 * @param {string} html - Page markup to scan; null/undefined is treated as empty.
 * @param {string|URL} baseUrl - Base used to resolve relative links.
 * @returns {string[]} Absolute URLs in first-seen order, without duplicates.
 *   Values that cannot be parsed as a URL are skipped.
 */
function extractLinks(html, baseUrl) {
  // The lookbehind rejects names such as data-href; the two capture groups
  // hold the double-quoted and single-quoted value respectively.
  const linkRegex = /(?<![\w-])href\s*=\s*(?:"([^"]+)"|'([^']+)')/gi;
  const uniqueLinks = new Set();

  for (const match of String(html ?? "").matchAll(linkRegex)) {
    // Attribute values are HTML-escaped, so "&amp;" stands for a literal "&".
    const rawLink = (match[1] ?? match[2]).replace(/&amp;/gi, "&");
    try {
      // Resolve relative links to absolute URLs
      uniqueLinks.add(new url.URL(rawLink, baseUrl).href);
    } catch {
      // Deliberately best-effort: an unparseable href is skipped.
    }
  }

  return Array.from(uniqueLinks);
}
/**
 * Crawl entry point: download the target page, collect its links, and write
 * a JSON report (url, crawledAt timestamp, links) to links.json.
 * Any failure is logged and terminates the process with exit code 1.
 */
async function main() {
  try {
    console.log(`Fetching page: ${targetUrl}...`);
    const pageMarkup = await fetchHtml(targetUrl);

    console.log("Analyzing page markup...");
    const discovered = extractLinks(pageMarkup, targetUrl);
    console.log(`Found ${discovered.length} unique links.`);

    // Property order here determines the key order in the written file.
    const report = {
      url: targetUrl,
      crawledAt: new Date().toISOString(),
      links: discovered,
    };
    const serialized = JSON.stringify(report, null, 2);

    await fs.writeFile("links.json", serialized, "utf8");
    console.log("Results successfully saved to links.json");
  } catch (failure) {
    console.error("Crawler failed:", failure.message);
    process.exit(1);
  }
}
main();
3. Running the CLI Crawler
To test the crawler, run the script against a public website:
# Run crawler targeting a website
node crawler.js https://www.wikipedia.org
Once execution completes, inspect the generated links.json output:
{
"url": "https://www.wikipedia.org",
"crawledAt": "2026-06-16T03:15:00.000Z",
"links": [
"https://en.wikipedia.org/",
"https://ja.wikipedia.org/",
"https://es.wikipedia.org/"
]
}
Published on Last updated: