Back to roadmaps pinecone Course

Project: PDF Semantic Indexing Pipeline

In this project, we will build an automated semantic indexing pipeline. The system takes the text content of a document (for example, text already extracted from a PDF), splits it into overlapping fixed-size chunks of words (chunking), calls an embedding model to generate a vector for each chunk, and saves the vectors to Pinecone.


1. Project Workflow

graph TD
    A[Raw Text Source] --> B[Text Chunking with Overlap]
    B --> C[Generate Vector via OpenAI API]
    C --> D[Structure Upsert Payload]
    D --> E[Batch Upload to Pinecone]

2. Implementing the Ingestion Script

Here is the TypeScript implementation for Node.js:

// src/services/ingestionPipeline.ts
import { pc } from "../lib/pinecone";
import { OpenAI } from "openai";

// OpenAI client used for the embedding requests in this module. The API key is
// read from the OPENAI_API_KEY environment variable when the module is loaded.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
// Handle to the index named "knowledge-base", obtained from the `pc` object
// exported by ../lib/pinecone (presumably a configured Pinecone client — confirm there).
const index = pc.index("knowledge-base");

// Describes a single chunk of document text.
// NOTE(review): nothing in the visible part of this file references this type.
interface TextChunk {
  // Identifier of the chunk.
  id: string;
  // Text content of the chunk.
  text: string;
  // Presumably the page of the source document the chunk came from — confirm with the author.
  pageNumber: number;
}

/**
 * 1. Splits text into overlapping chunks of whitespace-separated words.
 *
 * Each chunk holds up to `chunkSize` words and starts `chunkSize - overlap`
 * words after the previous one, so consecutive chunks share `overlap` words
 * (the defaults give a 20% overlap). Any run of whitespace (spaces, tabs,
 * newlines) counts as one separator, and the words of a chunk are re-joined
 * with single spaces.
 *
 * @param text - Raw text to split.
 * @param chunkSize - Maximum number of words per chunk; must be greater than 0.
 * @param overlap - Number of words shared by consecutive chunks; must be at
 *   least 0 and strictly less than `chunkSize`.
 * @returns The chunks in document order; an empty array when `text` contains
 *   no words.
 * @throws RangeError when `chunkSize` or `overlap` is out of range. Without
 *   this check the window would never advance and the loop would not end.
 */
export function splitTextIntoChunks(text: string, chunkSize = 500, overlap = 100): string[] {
  // Written as negated comparisons so that NaN is rejected as well.
  if (!(chunkSize > 0)) {
    throw new RangeError(`chunkSize must be greater than 0, got ${chunkSize}`);
  }
  if (!(overlap >= 0 && overlap < chunkSize)) {
    throw new RangeError(`overlap must be >= 0 and < chunkSize (${chunkSize}), got ${overlap}`);
  }

  // Splitting on /\s+/ and dropping empty tokens means newlines and repeated
  // spaces do not inflate or distort the word count, and blank input yields [].
  const words = text.split(/\s+/).filter((word) => word.length > 0);
  const step = chunkSize - overlap;
  const chunks: string[] = [];

  for (let start = 0; start < words.length; start += step) {
    chunks.push(words.slice(start, start + chunkSize).join(" "));
    // This chunk already reaches the last word; another window would contain
    // only words that are already part of this chunk.
    if (start + chunkSize >= words.length) {
      break;
    }
  }

  return chunks;
}

// Shape of one record this pipeline sends to the vector index.
type ChunkRecord = {
  id: string;
  values: number[];
  metadata: { title: string; text: string; chunk_index: number };
};

/**
 * 2. Main Ingestion Pipeline
 *
 * Splits the document into chunks, embeds them with the
 * "text-embedding-3-small" model and upserts one record per chunk into the
 * module-level index. Work proceeds in batches of 100 chunks: each batch is
 * embedded with a single embeddings request and uploaded before the next batch
 * is embedded, so embeddings for the whole document are never held in memory
 * at once.
 *
 * Record ids have the form `<title with whitespace runs replaced by "_">_chunk_<n>`,
 * where `n` is the position of the chunk in the document.
 *
 * @param rawDocumentText - Full text of the document to index.
 * @param documentTitle - Title stored in each record's metadata and used as
 *   the id prefix.
 * @returns A promise that resolves once the run has finished. Failures are
 *   logged with `console.error` and are NOT rethrown, so the promise resolves
 *   even when ingestion failed; batches uploaded before the failure remain in
 *   the index.
 *   NOTE(review): callers cannot detect a failed run — consider rethrowing or
 *   returning a status once existing callers have been checked.
 */
export async function runIngestionPipeline(rawDocumentText: string, documentTitle: string): Promise<void> {
  try {
    console.log("Starting document ingestion pipeline...");
    const rawChunks: string[] = splitTextIntoChunks(rawDocumentText);
    const idPrefix = documentTitle.replace(/\s+/g, "_");

    // Keep each chunk's original position so ids and chunk_index stay stable,
    // and drop blank chunks, which carry no content worth embedding.
    const pending: { text: string; chunkIndex: number }[] = [];
    for (const [chunkIndex, text] of rawChunks.entries()) {
      if (text.trim().length > 0) {
        pending.push({ text, chunkIndex });
      }
    }

    // Batches of 100 keep both the embeddings request and the upsert request
    // body small.
    const batchSize = 100;
    const totalBatches = Math.ceil(pending.length / batchSize);

    for (let start = 0; start < pending.length; start += batchSize) {
      const batch = pending.slice(start, start + batchSize);

      // A. Call OpenAI Embeddings API once for the whole batch
      const response = await openai.embeddings.create({
        model: "text-embedding-3-small",
        input: batch.map((entry) => entry.text),
      });

      // Match embeddings to inputs through the `index` field of each result
      // instead of relying on the order of the response array.
      const embeddingByPosition = new Map<number, number[]>();
      for (const item of response.data) {
        embeddingByPosition.set(item.index, item.embedding);
      }

      // B. Build the upload batch
      const records: ChunkRecord[] = batch.map((entry, position) => {
        const values = embeddingByPosition.get(position);
        if (values === undefined) {
          throw new Error(`No embedding returned for chunk ${entry.chunkIndex}`);
        }
        return {
          id: `${idPrefix}_chunk_${entry.chunkIndex}`,
          values,
          metadata: {
            title: documentTitle,
            text: entry.text,
            chunk_index: entry.chunkIndex,
          },
        };
      });

      // C. Upload the batch
      await index.upsert(records);
      console.log(`Uploaded batch ${start / batchSize + 1} of ${totalBatches}`);
    }

    console.log("Ingestion successfully completed!");
  } catch (err: unknown) {
    // Anything can be thrown, so only read `.message` from real Error objects.
    const message = err instanceof Error ? err.message : String(err);
    console.error("Ingestion failed:", message);
  }
}
Published on Last updated: