Automating Web Scraping with AI in 2025: A TypeScript Guide

Learn how to build an intelligent web scraping system using TypeScript, Puppeteer, OpenAI, and LangChain. Includes rate limiting, proxy rotation, and error handling.

This guide incorporates the latest AI models and web scraping techniques, with a focus on ethical data collection and processing.

Web scraping is essential for data collection, but modern websites are increasingly complex. Let's explore how to use AI to build a robust, intelligent scraping system.

Architecture Overview

We'll build a system that:

  • Uses AI to understand webpage structure
  • Handles dynamic content automatically
  • Adapts to website changes
  • Processes data intelligently
  • Respects robots.txt and rate limits

Setting Up the Environment

First, create a new TypeScript project:

bash

mkdir ai-scraper
cd ai-scraper
npm init -y
npm install puppeteer openai langchain cheerio zod ioredis robots-parser @sentry/node prom-client node-schedule
# Puppeteer ships its own type definitions, so @types/puppeteer is not needed
npm install -D typescript @types/node

Configure TypeScript:

json

{
  "compilerOptions": {
    "target": "ES2022",
    "module": "NodeNext",
    "moduleResolution": "NodeNext",
    "outDir": "./dist",
    "rootDir": "./src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules"]
}

Core Scraping Engine

1. Browser Management

typescript

import puppeteer, { Browser, Page } from "puppeteer";
 
import { ProxyRotator } from "./proxy";
 
export class BrowserManager {
  private browser: Browser | null = null;
  private proxyRotator: ProxyRotator;
 
  constructor(proxyList: string[]) {
    this.proxyRotator = new ProxyRotator(proxyList);
  }
 
  async initialize(): Promise<Browser> {
    const proxy = this.proxyRotator.getNext();
 
    this.browser = await puppeteer.launch({
      headless: true, // recent Puppeteer versions use the new headless mode by default
      args: ["--no-sandbox", "--disable-setuid-sandbox", `--proxy-server=${proxy}`],
    });
 
    return this.browser;
  }
 
  async newPage(): Promise<Page> {
    if (!this.browser) {
      throw new Error("Browser not initialized");
    }
 
    const page = await this.browser.newPage();
 
    await page.setUserAgent(
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
        "AppleWebKit/537.36 (KHTML, like Gecko) " +
        "Chrome/120.0.0.0 Safari/537.36",
    );
 
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, "webdriver", {
        get: () => false,
      });
    });
 
    return page;
  }
 
  async close() {
    if (this.browser) {
      await this.browser.close();
      this.browser = null;
    }
  }
}
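
The ProxyRotator imported from ./proxy is never shown in this guide. A minimal round-robin implementation, sketched here as an assumption consistent with the getNext() call above, could look like this:

typescript

// proxy.ts (assumed file name) - simple round-robin rotation over a fixed proxy list
export class ProxyRotator {
  private index = 0;

  constructor(private proxies: string[]) {
    if (proxies.length === 0) {
      throw new Error("ProxyRotator requires at least one proxy");
    }
  }

  // Return the next proxy URL, wrapping back to the start of the list
  getNext(): string {
    const proxy = this.proxies[this.index];
    this.index = (this.index + 1) % this.proxies.length;
    return proxy;
  }
}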

2. AI-Powered Selector Generation

typescript

import type { Page } from "puppeteer";

import { OpenAI } from "langchain/llms/openai";
import { PromptTemplate } from "langchain/prompts";
 
export class AISelectorGenerator {
  private model: OpenAI;
 
  constructor(apiKey: string) {
    this.model = new OpenAI({
      openAIApiKey: apiKey,
      temperature: 0.2,
      modelName: "gpt-4-turbo-preview",
    });
  }
 
  async generateSelectors(html: string, target: string): Promise<string[]> {
    const prompt = PromptTemplate.fromTemplate(`
      Analyze this HTML and generate robust CSS selectors for {target}.
      Consider:
      1. Unique attributes
      2. Structural patterns
      3. Semantic meaning
 
      HTML: {html}
 
      Return only the selectors, one per line.
    `);
 
    const response = await this.model.call(
      await prompt.format({
        html: html.substring(0, 4000),
        target,
      }),
    );
 
    return response.split("\n").filter(Boolean);
  }
 
  async validateSelector(page: Page, selector: string): Promise<boolean> {
    try {
      await page.waitForSelector(selector, { timeout: 5000 });
      return true;
    } catch {
      return false;
    }
  }
}
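
Putting the two methods together, a hypothetical helper (not part of the original code) might generate candidate selectors and keep the first one that actually resolves on the live page:

typescript

import type { Page } from "puppeteer";

import { AISelectorGenerator } from "./ai-selector";

// Hypothetical helper: return the first AI-suggested selector that exists on the page
export async function findWorkingSelector(
  generator: AISelectorGenerator,
  page: Page,
  target: string,
): Promise<string | null> {
  const html = await page.content();
  const candidates = await generator.generateSelectors(html, target);

  for (const selector of candidates) {
    if (await generator.validateSelector(page, selector)) {
      return selector;
    }
  }

  return null; // none of the suggestions matched within the timeout
}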

3. Data Extraction and Processing

typescript

import { CheerioAPI, load } from "cheerio";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { z } from "zod";
 
const ProductSchema = z.object({
  name: z.string(),
  price: z.number(),
  description: z.string(),
  features: z.array(z.string()),
  images: z.array(z.string().url()),
});
 
export class DataExtractor {
  private embeddings: OpenAIEmbeddings;
 
  constructor(apiKey: string) {
    this.embeddings = new OpenAIEmbeddings({
      openAIApiKey: apiKey,
    });
  }
 
  async extractStructured(html: string, schema: z.ZodType): Promise<unknown> {
    const $ = load(html);
 
    const structuredData = await this.parseWithAI($, schema);
 
    return schema.parse(structuredData);
  }
 
  async extractUnstructured(html: string): Promise<string[]> {
    const $ = load(html);
 
    $("script, style, nav, footer").remove();
 
    const text = $.text();
 
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1000,
      chunkOverlap: 200,
    });
 
    return splitter.splitText(text);
  }
 
  private async parseWithAI($: CheerioAPI, schema: z.ZodType): Promise<unknown> {
    // Implementation details...
  }
}
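
parseWithAI is left as a stub above. One possible approach, shown only as a sketch that reuses the same LangChain OpenAI wrapper and prompt style from earlier, is to send the cleaned page text to the model, ask for JSON, and let Zod validate the result:

typescript

import { load } from "cheerio";
import { OpenAI } from "langchain/llms/openai";
import { z } from "zod";

// Sketch: ask the model for a JSON object and validate it against the Zod schema.
// A production version would describe the expected fields (derived from the schema)
// in the prompt; that detail is omitted here.
export async function parseHtmlWithAI(
  model: OpenAI,
  html: string,
  schema: z.ZodType,
): Promise<unknown> {
  const $ = load(html);
  $("script, style, nav, footer").remove();

  const response = await model.call(
    "Extract the relevant fields from this page text and respond with a single JSON object, no prose:\n\n" +
      $.text().substring(0, 4000),
  );

  // Throws if the model returned malformed JSON or data that violates the schema
  return schema.parse(JSON.parse(response));
}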

4. Rate Limiting and Respect

typescript

import { Redis } from "ioredis";
import robotsParser from "robots-parser";
 
export class RateLimiter {
  private redis: Redis;
  private robotsTxt: Map<string, any> = new Map();
 
  constructor(redisUrl: string) {
    this.redis = new Redis(redisUrl);
  }
 
  async canAccess(url: string): Promise<boolean> {
    const domain = new URL(url).hostname;
 
    if (!this.robotsTxt.has(domain)) {
      const robotsTxt = await this.fetchRobotsTxt(domain);
      this.robotsTxt.set(domain, robotsTxt);
    }
 
    const rules = this.robotsTxt.get(domain);
    if (!rules.isAllowed(url)) {
      return false;
    }
 
    const key = `ratelimit:${domain}`;
    const current = await this.redis.incr(key);
 
    if (current === 1) {
      await this.redis.expire(key, 60);
    }
 
    // Crawl-delay is the number of seconds to wait between requests,
    // so convert it into a per-minute request budget
    const crawlDelay = rules.getCrawlDelay() || 1;
    const limit = Math.max(1, Math.floor(60 / crawlDelay));
    return current <= limit;
  }
 
  private async fetchRobotsTxt(domain: string) {
    // Implementation details...
  }
}
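
fetchRobotsTxt is also stubbed. A minimal version, assuming Node 18+ for the built-in fetch and the robots-parser package for the isAllowed and getCrawlDelay calls used above, might look like this:

typescript

import robotsParser from "robots-parser";

// Fetch and parse robots.txt for a domain; an empty or missing file means "allow everything"
export async function fetchRobotsTxt(domain: string) {
  const robotsUrl = `https://${domain}/robots.txt`;

  try {
    const response = await fetch(robotsUrl);
    const body = response.ok ? await response.text() : "";
    return robotsParser(robotsUrl, body);
  } catch {
    // Network failure: fall back to permissive rules rather than blocking the crawl
    return robotsParser(robotsUrl, "");
  }
}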

Intelligent Scraping Pipeline

1. Task Definition

typescript

import { z } from "zod";
 
export const TaskSchema = z.object({
  url: z.string().url(),
  selectors: z.array(z.string()).optional(),
  schema: z.custom<z.ZodType>((value) => value instanceof z.ZodType),
  maxRetries: z.number().default(3),
  priority: z.number().default(1),
});
 
export type Task = z.infer<typeof TaskSchema>;
 
export class TaskQueue {
  private queue: Task[] = [];
 
  add(task: Task) {
    this.queue.push(task);
    this.queue.sort((a, b) => b.priority - a.priority);
  }
 
  next(): Task | undefined {
    return this.queue.shift();
  }
 
  isEmpty(): boolean {
    return this.queue.length === 0;
  }
}

2. Main Scraper Class

typescript

import { z } from "zod";

import { AISelectorGenerator } from "./ai-selector";
import { BrowserManager } from "./browser";
import { DataExtractor } from "./extractor";
import { RateLimiter } from "./rate-limiter";
import { Task, TaskQueue, TaskSchema } from "./tasks";

export interface ScraperConfig {
  openAIKey: string;
  redisUrl: string;
  proxies: string[];
}

export class AIScraper {
  private browser: BrowserManager;
  private selector: AISelectorGenerator;
  private extractor: DataExtractor;
  private rateLimiter: RateLimiter;
  private taskQueue: TaskQueue;
 
  constructor(config: ScraperConfig) {
    this.browser = new BrowserManager(config.proxies);
    this.selector = new AISelectorGenerator(config.openAIKey);
    this.extractor = new DataExtractor(config.openAIKey);
    this.rateLimiter = new RateLimiter(config.redisUrl);
    this.taskQueue = new TaskQueue();
  }

  addTask(task: z.input<typeof TaskSchema>) {
    this.taskQueue.add(TaskSchema.parse(task));
  }
 
  async start() {
    await this.browser.initialize();
 
    while (!this.taskQueue.isEmpty()) {
      const task = this.taskQueue.next();
      if (!task) continue;
 
      await this.processTask(task);
    }
 
    await this.browser.close();
  }
 
  private async processTask(task: Task) {
    if (!(await this.rateLimiter.canAccess(task.url))) {
      this.taskQueue.add({ ...task, priority: task.priority - 1 });
      return;
    }
 
    const page = await this.browser.newPage();
 
    try {
      await page.goto(task.url, {
        waitUntil: "networkidle0",
      });
 
      const selectors =
        task.selectors ??
        (await this.selector.generateSelectors(
          await page.content(),
          task.schema.description ?? "the target data",
        ));

      // Waiting on the first generated selector gives dynamic content a chance to render
      if (selectors[0]) {
        await this.selector.validateSelector(page, selectors[0]);
      }

      const html = await page.content();
      const data = await this.extractor.extractStructured(html, task.schema);
 
      await this.processResults(data);
    } catch (error) {
      if (task.maxRetries > 0) {
        this.taskQueue.add({
          ...task,
          maxRetries: task.maxRetries - 1,
        });
      }
      console.error(`Failed to process ${task.url}:`, error);
    } finally {
      await page.close();
    }
  }
 
  private async processResults(data: unknown) {
    // Implementation details...
  }
}

Usage Example

typescript

import { ProductSchema } from "./extractor";
import { AIScraper } from "./scraper";
 
async function main() {
  const scraper = new AIScraper({
    openAIKey: process.env.OPENAI_API_KEY!,
    redisUrl: process.env.REDIS_URL!,
    proxies: ["http://proxy1.example.com", "http://proxy2.example.com"],
  });
 
  scraper.addTask({
    url: "https://example.com/products",
    schema: ProductSchema,
    priority: 2,
  });
 
  await scraper.start();
}
 
main().catch(console.error);

Error Handling and Monitoring

1. Error Tracking

typescript

import * as Sentry from "@sentry/node";
 
export class ErrorTracker {
  constructor(dsn: string) {
    Sentry.init({
      dsn,
      tracesSampleRate: 1.0,
      integrations: [new Sentry.Integrations.Http({ tracing: true })],
    });
  }
 
  captureError(error: Error, context: Record<string, any>) {
    Sentry.withScope(scope => {
      scope.setExtras(context);
      Sentry.captureException(error);
    });
  }
 
  async flush(): Promise<boolean> {
    return Sentry.flush(2000);
  }
}
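
Wiring the tracker around a scraper run can be as small as a wrapper function. The SENTRY_DSN variable and the ./error-tracker module path are assumptions for this sketch:

typescript

import { ErrorTracker } from "./error-tracker";

const tracker = new ErrorTracker(process.env.SENTRY_DSN!);

// Run any async job, report failures with context, and flush events before exiting
export async function runWithTracking(
  job: () => Promise<void>,
  context: Record<string, any> = {},
) {
  try {
    await job();
  } catch (error) {
    tracker.captureError(error as Error, context);
    throw error;
  } finally {
    await tracker.flush();
  }
}

For example, the main() function from the usage example could call runWithTracking(() => scraper.start(), { stage: "scrape" }).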

2. Performance Monitoring

typescript

import { Counter, Gauge } from "prom-client";
 
export class MetricsCollector {
  private taskGauge: Gauge;
  private successCounter: Counter;
  private failureCounter: Counter;
 
  constructor() {
    this.taskGauge = new Gauge({
      name: "scraper_tasks_remaining",
      help: "Number of tasks remaining in queue",
    });
 
    this.successCounter = new Counter({
      name: "scraper_successful_scrapes",
      help: "Number of successful scrapes",
    });
 
    this.failureCounter = new Counter({
      name: "scraper_failed_scrapes",
      help: "Number of failed scrapes",
      labelNames: ["reason"],
    });
  }
 
  updateTaskCount(count: number) {
    this.taskGauge.set(count);
  }
 
  recordSuccess() {
    this.successCounter.inc();
  }
 
  recordFailure(reason: string) {
    this.failureCounter.inc({ reason });
  }
}
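
prom-client keeps these metrics in its default registry, so exposing them to Prometheus only needs a small HTTP endpoint. A dependency-free sketch using Node's built-in http module (the port is arbitrary):

typescript

import { createServer } from "http";
import { register } from "prom-client";

// Serve the default prom-client registry at /metrics for Prometheus to scrape
createServer(async (req, res) => {
  if (req.url === "/metrics") {
    res.setHeader("Content-Type", register.contentType);
    res.end(await register.metrics());
  } else {
    res.statusCode = 404;
    res.end();
  }
}).listen(9100);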

Advanced Features

1. Content Change Detection

typescript

import { createHash } from "crypto";
import { Redis } from "ioredis";
 
export class ChangeDetector {
  private redis: Redis;
 
  constructor(redisUrl: string) {
    this.redis = new Redis(redisUrl);
  }
 
  async hasChanged(url: string, content: string): Promise<boolean> {
    const hash = this.hashContent(content);
    const key = `content:${url}`;
 
    const oldHash = await this.redis.get(key);
    await this.redis.set(key, hash);
 
    return oldHash !== hash;
  }
 
  private hashContent(content: string): string {
    return createHash("sha256").update(content).digest("hex");
  }
}

2. Smart Scheduling

typescript

import schedule from "node-schedule";

// Module path assumes ChangeDetector lives in change-detector.ts
import { ChangeDetector } from "./change-detector";
import { Task, TaskQueue } from "./tasks";
 
export class SmartScheduler {
  private queue: TaskQueue;
 
  constructor(queue: TaskQueue) {
    this.queue = queue;
  }
 
  scheduleTask(cronExpression: string, taskGenerator: () => Task) {
    schedule.scheduleJob(cronExpression, () => {
      this.queue.add(taskGenerator());
    });
  }
 
  scheduleAdaptive(url: string, changeDetector: ChangeDetector) {
    // Implementation details...
  }
}
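
scheduleAdaptive is another stub. One way to fill it, sketched here under the assumptions that page fetching is abstracted behind a caller-supplied function and that ChangeDetector lives in change-detector.ts, is to poll the URL, back off while the content hash stays stable, and tighten the interval again after a change:

typescript

import { ChangeDetector } from "./change-detector";

// Sketch: adaptive polling that widens the interval while content is unchanged
export async function pollAdaptively(
  url: string,
  changeDetector: ChangeDetector,
  fetchHtml: (url: string) => Promise<string>,
  onChange: () => void,
) {
  let intervalMs = 60_000; // start by checking once a minute

  const tick = async () => {
    const html = await fetchHtml(url);

    if (await changeDetector.hasChanged(url, html)) {
      onChange();
      intervalMs = 60_000; // content moved: return to frequent checks
    } else {
      intervalMs = Math.min(intervalMs * 2, 60 * 60_000); // stable: back off, up to an hour
    }

    setTimeout(tick, intervalMs);
  };

  await tick();
}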

Best Practices

  1. Respect Websites

    • Follow robots.txt
    • Implement rate limiting
    • Use meaningful user agents
  2. Handle Errors Gracefully

    • Implement retries with backoff (a sketch follows this list)
    • Log failures for analysis
    • Monitor system health
  3. Optimize Performance

    • Use connection pooling
    • Implement caching
    • Batch requests when possible
  4. Maintain Data Quality

    • Validate extracted data
    • Handle missing fields
    • Version your schemas
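
For the retry advice above, a small generic helper with exponential backoff and jitter (a sketch, not tied to any class in this guide) could wrap the page.goto or processTask calls:

typescript

// Retry an async operation with exponential backoff plus a little jitter
export async function withRetries<T>(
  fn: () => Promise<T>,
  maxRetries = 3,
  baseDelayMs = 1_000,
): Promise<T> {
  let lastError: unknown;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error;
      // 1s, 2s, 4s, ... plus up to 250ms of random noise to avoid synchronized retries
      const delay = baseDelayMs * 2 ** attempt + Math.random() * 250;
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }

  throw lastError;
}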

Final Notes

Always review websites' terms of service and robots.txt before scraping. Consider using official APIs when available.