Project: Local Document Vector Search

In this project, we will build a completely offline RAG (Retrieval-Augmented Generation) script. The application reads local knowledge documents, converts their text into vector embeddings using Ollama, runs a semantic search over those embeddings, and feeds the best-matching context into a local LLM.

1. RAG Core Pipeline

```mermaid
graph TD
    A[Raw local document text] --> B[Generate Embeddings via Ollama API]
    B --> C[Compute cosine similarity between query and chunk embeddings]
    C --> D[Retrieve top matching text chunks]
    D --> E[Pass retrieved context to Qwen model]
```

2. Implementing the RAG Engine

Write the helpers that embed the document chunks, rank them against the query by cosine similarity, and pass the best matches to the local model:

// src/services/localRag.ts
import ollama from "@ollama/ollama";

/** A passage of source text stored together with its embedding vector. */
interface DocumentChunk {
  /** The original passage, kept so it can be quoted back as context. */
  text: string;
  /** Embedding of `text`, as returned by the embeddings call. */
  embedding: Array<number>;
}

/**
 * Computes the cosine similarity between two vectors.
 *
 * @param vecA - First vector.
 * @param vecB - Second vector; must have the same length as `vecA`.
 * @returns The dot product divided by the product of the two Euclidean norms,
 *   or `0` when that product is zero (empty input or an all-zero vector),
 *   because the angle between the vectors is undefined in that case.
 * @throws RangeError If the two vectors differ in length.
 */
function cosineSimilarity(vecA: readonly number[], vecB: readonly number[]): number {
  // Comparing vectors of different dimensionality is a caller bug; failing loudly
  // beats returning NaN (vecB shorter) or silently ignoring the tail (vecB longer).
  if (vecA.length !== vecB.length) {
    throw new RangeError(
      `Vector length mismatch: ${vecA.length} vs ${vecB.length}`,
    );
  }

  let dotProduct = 0;
  let normA = 0;
  let normB = 0;

  for (let i = 0; i < vecA.length; i++) {
    dotProduct += vecA[i] * vecB[i];
    normA += vecA[i] * vecA[i];
    normB += vecB[i] * vecB[i];
  }

  const denominator = Math.sqrt(normA) * Math.sqrt(normB);

  // Without this guard a zero vector produces 0 / 0 = NaN, which then breaks
  // any numeric comparison or sort performed on the score.
  return denominator === 0 ? 0 : dotProduct / denominator;
}

/**
 * Answers `userQuery` with a local model, using the most similar entries of
 * `documentChunks` as context.
 *
 * Steps: embed every chunk, embed the query, score each chunk against the
 * query with `cosineSimilarity`, then send the best-scoring chunks together
 * with the question to the chat model.
 *
 * @param documentChunks - Text passages to search. Every passage is embedded
 *   on each call; nothing is cached between calls.
 * @param userQuery - The question to answer.
 * @param options - Optional overrides. Omitting it keeps the previous
 *   hard-coded behaviour (`"qwen2.5"` for both models, two context chunks).
 * @param options.chatModel - Model name passed to `ollama.chat`.
 * @param options.embeddingModel - Model name passed to `ollama.embeddings`.
 *   NOTE(review): a dedicated embedding model is probably a better fit than a
 *   chat model here — confirm against the Ollama documentation.
 * @param options.topK - Maximum number of best-scoring chunks to include.
 * @returns The content of the chat model's reply message.
 */
export async function buildLocalRagSystem(
  documentChunks: string[],
  userQuery: string,
  options: { chatModel?: string; embeddingModel?: string; topK?: number } = {},
): Promise<string> {
  const {
    chatModel = "qwen2.5",
    embeddingModel = "qwen2.5",
    topK = 2,
  } = options;
  const database: DocumentChunk[] = [];

  // 1. Convert all text chunks into embeddings.
  // Awaiting inside the loop keeps at most one embedding request in flight.
  for (const chunk of documentChunks) {
    const response = await ollama.embeddings({
      model: embeddingModel,
      prompt: chunk,
    });

    database.push({
      text: chunk,
      embedding: response.embedding,
    });
  }

  // 2. Generate embedding vector for the user query (same model as the chunks,
  // so both sides of the comparison come from the same embedding call).
  const queryResponse = await ollama.embeddings({
    model: embeddingModel,
    prompt: userQuery,
  });
  const queryVector = queryResponse.embedding;

  // 3. Compute similarities and sort chunks, best match first.
  const matches = database
    .map((chunk) => ({
      text: chunk.text,
      score: cosineSimilarity(queryVector, chunk.embedding),
    }))
    // Drop chunks whose score is NaN: NaN makes the sort comparator
    // inconsistent, so the resulting order would be unreliable.
    .filter((match) => !Number.isNaN(match.score));

  matches.sort((a, b) => b.score - a.score);

  // A negative end index would make slice() count from the end of the array.
  const bestContextText = matches
    .slice(0, Math.max(0, topK))
    .map((m) => m.text)
    .join("\n\n");

  // 4. Query local model passing retrieved context data
  const finalPrompt = `
    Use the following verified context documents to answer the question:
    ---
    ${bestContextText}
    ---
    Question: ${userQuery}
  `;

  const finalResponse = await ollama.chat({
    model: chatModel,
    messages: [{ role: "user", content: finalPrompt }],
  });

  return finalResponse.message.content;
}

3. Testing Local Document Queries

Call the function, passing in a few mock document chunks and a question:

// Question to answer; it is matched against the mock chunks below.
const question = "Can I work remotely on Friday?";

// Mock knowledge base: one chunk about Friday remote work plus two unrelated facts.
const knowledgeBase: string[] = [
  "Company policy: employees can work from home on Fridays.",
  "Office location: the main office is in Seattle.",
  "Financial calendar: the fiscal year ends on December 31.",
];

// Run the full pipeline (embed, rank, ask the model) and print the reply.
const reply = await buildLocalRagSystem(knowledgeBase, question);
console.log("Local RAG Answer:", reply);

Published on Jun 16, 2026 · Last updated: Jun 16, 2026

Getting Started

Popular Models

HTTP API SDKs

Practice Project

Resources

Project: Local Document Vector Search

1. RAG Core Pipeline

2. Implementing the RAG Engine

3. Testing Local Document Queries