Back to roadmaps nodejs Course

Project: CLI Web Crawler Tool

In this project, we will build a command-line interface (CLI) crawler that fetches web pages, extracts hyperlinks using regular expressions, and writes the results to a structured JSON file.


1. Project Specifications

  • Target: Download page markup from a specified URL.
  • Extraction: Extract every unique link (href attribute value) from the markup and resolve relative links to absolute URLs.
  • Output: Save the results into a file named links.json.
  • Execution: Run from the command-line by passing the target URL as an argument.

2. Implementing the Crawler

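Create a new file named crawler.js and paste the following implementation. The script uses ES module import syntax, so make sure the project's package.json contains "type": "module" (or name the file crawler.mjs and adjust the commands below accordingly):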

import fs from "fs/promises";
import http from "http";
import https from "https";
import url from "url";

// The URL to crawl is the first user-supplied argument:
//   node crawler.js <url>
// (the first two argv entries are skipped, so only that argument is read)
const [, , targetUrl] = process.argv;

if (!targetUrl) {
  // No URL given: print the error and usage hint to stderr, then exit non-zero.
  const usageLines = [
    "Error: Please provide a target URL.",
    "Usage: node crawler.js https://example.com",
  ];
  for (const line of usageLines) {
    console.error(line);
  }
  process.exit(1);
}

/**
 * Downloads a page over HTTP or HTTPS and resolves with its body as a string.
 *
 * Redirects are NOT followed: any status code other than 200 is treated as a
 * failure.
 *
 * @param {string} target - Absolute `http://` or `https://` URL to download.
 * @returns {Promise<string>} Resolves with the complete response body decoded
 *   as UTF-8. Rejects when the URL is invalid or uses an unsupported protocol,
 *   when the status code is not 200, or when a network/stream error occurs.
 */
function fetchHtml(target) {
  return new Promise((resolve, reject) => {
    // Anything thrown synchronously in this executor (invalid URL,
    // unsupported protocol) is turned into a rejection by the Promise.
    const { protocol } = new URL(target);
    const client = protocol === "http:" ? http : https;

    client
      .get(target, (res) => {
        if (res.statusCode !== 200) {
          // Drain the unread body so the underlying socket is released
          // instead of being held open by an unconsumed response.
          res.resume();
          reject(new Error(`Failed to fetch page, status code: ${res.statusCode}`));
          return;
        }

        // Decode at the stream level: converting each Buffer chunk on its own
        // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
        res.setEncoding("utf8");

        let rawData = "";
        res.on("data", (chunk) => {
          rawData += chunk;
        });
        res.on("end", () => {
          resolve(rawData);
        });
        // The response stream can fail independently of the request
        // (e.g. the connection drops mid-body).
        res.on("error", reject);
      })
      .on("error", reject);
  });
}

/**
 * Extracts the unique link targets found in `href` attributes of an HTML
 * string and resolves them to absolute URLs.
 *
 * This is a regex-based scan, not a real HTML parser: it recognises
 * double- and single-quoted `href` values (any letter case, optional
 * whitespace around `=`) on any element. Unquoted attribute values are not
 * recognised.
 *
 * @param {string} html - Page markup to scan.
 * @param {string} baseUrl - URL used to resolve relative links.
 * @returns {string[]} Absolute URLs in first-seen order, without duplicates.
 *   Values that cannot be parsed as a URL are skipped.
 */
function extractLinks(html, baseUrl) {
  // The lookbehind keeps attributes that merely end in "href" (data-href,
  // ng-href, ...) from being mistaken for a real href attribute.
  const hrefPattern = /(?<![\w-])href\s*=\s*(?:"([^"]+)"|'([^']+)')/gi;
  const uniqueLinks = new Set();

  for (const [, doubleQuoted, singleQuoted] of String(html ?? "").matchAll(hrefPattern)) {
    // "&" is written as "&amp;" inside HTML attribute values; undo that so
    // query strings survive intact. Other entities are left untouched.
    const rawLink = (doubleQuoted ?? singleQuoted).replace(/&amp;/g, "&");

    try {
      // Resolve relative links against the page URL.
      uniqueLinks.add(new URL(rawLink, baseUrl).href);
    } catch {
      // Deliberate best-effort: a malformed href is skipped so one bad link
      // does not abort the whole extraction.
    }
  }

  return [...uniqueLinks];
}

// Runs the whole crawl: download the page, pull out its links, and persist
// the result as pretty-printed JSON in links.json. Any failure is reported
// on stderr and the process exits with status 1.
async function main() {
  try {
    console.log(`Fetching page: ${targetUrl}...`);
    const markup = await fetchHtml(targetUrl);

    console.log("Analyzing page markup...");
    const foundLinks = extractLinks(markup, targetUrl);
    console.log(`Found ${foundLinks.length} unique links.`);

    // Assemble the report first, then serialise it with 2-space indentation.
    const report = {
      url: targetUrl,
      crawledAt: new Date().toISOString(),
      links: foundLinks,
    };
    const serialized = JSON.stringify(report, null, 2);

    await fs.writeFile("links.json", serialized, "utf8");
    console.log("Results successfully saved to links.json");
  } catch (err) {
    console.error("Crawler failed:", err.message);
    process.exit(1);
  }
}

// Entry point: start the crawl as soon as the script is loaded.
main();

3. Running the CLI Crawler

To test the crawler, run the script against a public website:

# Run crawler targeting a website
node crawler.js https://www.wikipedia.org

Once execution completes, inspect the generated links.json output:

{
  "url": "https://www.wikipedia.org",
  "crawledAt": "2026-06-16T03:15:00.000Z",
  "links": [
    "https://en.wikipedia.org/",
    "https://ja.wikipedia.org/",
    "https://es.wikipedia.org/"
  ]
}
Published on Last updated: