Building Production RAG Systems with TypeScript and pgvector

February 10, 202616 min read

Retrieval-Augmented Generation (RAG) lets LLMs answer questions from your own data. Build a complete pipeline: chunking, embedding, vector search, and grounded generation with TypeScript and pgvector.

Building Production RAG Systems with TypeScript and pgvector

Introduction

Large Language Models know a lot, but they don't know your data. Retrieval-Augmented Generation (RAG) bridges that gap: at query time, you retrieve the most relevant chunks from a vector database and inject them into the LLM's context. The LLM answers from your data, not from its training.

This guide builds a complete, production-ready RAG pipeline using TypeScript, OpenAI embeddings, and pgvector — all in Postgres.

Architecture

Documents → Chunk → Embed → Store in pgvector
User query → Embed → Similarity search → Top-K chunks → LLM → Answer

Step 1: Enable pgvector in Postgres

CREATE EXTENSION IF NOT EXISTS vector;

CREATE TABLE documents (
  id BIGSERIAL PRIMARY KEY,
  source TEXT NOT NULL,          -- filename or URL
  chunk_index INTEGER NOT NULL,  -- position in source
  content TEXT NOT NULL,         -- the raw text chunk
  embedding VECTOR(1536),        -- OpenAI text-embedding-3-small
  metadata JSONB DEFAULT '{}'
);

-- IVFFlat index for approximate nearest neighbour search
CREATE INDEX ON documents USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);

Step 2: Chunking Strategy

Chunking has a huge impact on retrieval quality. The goal: chunks small enough to be precise, large enough to be meaningful.

// lib/chunker.ts
interface Chunk {
  content: string;
  chunkIndex: number;
  source: string;
}

export function chunkText(text: string, source: string, options = {
  chunkSize: 512,
  overlap: 64,
}): Chunk[] {
  const { chunkSize, overlap } = options;
  const chunks: Chunk[] = [];
  let start = 0;
  let index = 0;

  while (start < text.length) {
    const end = Math.min(start + chunkSize, text.length);
    let chunkEnd = end;

    // Try to break at sentence boundary
    if (end < text.length) {
      const sentenceEnd = text.lastIndexOf('.', end);
      if (sentenceEnd > start + chunkSize / 2) {
        chunkEnd = sentenceEnd + 1;
      }
    }

    chunks.push({
      content: text.slice(start, chunkEnd).trim(),
      chunkIndex: index++,
      source,
    });

    start = chunkEnd - overlap;
  }

  return chunks;
}

Step 3: Embedding and Storing

// lib/embed.ts
import OpenAI from 'openai';
import { db } from './db';

const openai = new OpenAI();

export async function embedAndStore(chunks: Chunk[]) {
  // Batch embedding — up to 2048 inputs per call
  const BATCH_SIZE = 100;

  for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
    const batch = chunks.slice(i, i + BATCH_SIZE);

    const response = await openai.embeddings.create({
      model: 'text-embedding-3-small',
      input: batch.map(c => c.content),
    });

    const rows = batch.map((chunk, j) => ({
      source: chunk.source,
      chunk_index: chunk.chunkIndex,
      content: chunk.content,
      embedding: JSON.stringify(response.data[j].embedding),
    }));

    await db.query(
      `INSERT INTO documents (source, chunk_index, content, embedding)
       SELECT * FROM UNNEST($1::text[], $2::int[], $3::text[], $4::vector[])`,
      [
        rows.map(r => r.source),
        rows.map(r => r.chunk_index),
        rows.map(r => r.content),
        rows.map(r => r.embedding),
      ]
    );

    console.log(`Stored ${i + batch.length}/${chunks.length} chunks`);
  }
}

Step 4: Retrieval

// lib/retrieve.ts
export async function retrieveContext(query: string, topK = 5): Promise<string[]> {
  // Embed the query
  const response = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: query,
  });
  const queryEmbedding = JSON.stringify(response.data[0].embedding);

  // Cosine similarity search
  const result = await db.query<{ content: string; similarity: number }>(
    `SELECT content, 1 - (embedding <=> $1::vector) AS similarity
     FROM documents
     ORDER BY embedding <=> $1::vector
     LIMIT $2`,
    [queryEmbedding, topK]
  );

  return result.rows
    .filter(r => r.similarity > 0.75) // Quality threshold
    .map(r => r.content);
}

Step 5: Grounded Generation

// lib/rag.ts
export async function ragQuery(userQuestion: string): Promise<string> {
  const contextChunks = await retrieveContext(userQuestion);

  if (!contextChunks.length) {
    return "I don't have relevant information to answer that question from your documents.";
  }

  const context = contextChunks.join('

---

');

  const response = await openai.chat.completions.create({
    model: 'gpt-4o-mini',
    messages: [
      {
        role: 'system',
        content: `You are a helpful assistant. Answer the user's question using ONLY the context below. 
If the context doesn't contain the answer, say so clearly.
Do not make up information.

Context:
${context}`,
      },
      { role: 'user', content: userQuestion },
    ],
    temperature: 0.1,
  });

  return response.choices[0].message.content ?? '';
}

Improving Retrieval Quality

Hybrid Search (vector + full-text)

-- Combine vector similarity with keyword search using RRF
WITH vector_results AS (
  SELECT id, content, 1 - (embedding <=> $1::vector) AS score,
         ROW_NUMBER() OVER (ORDER BY embedding <=> $1::vector) AS rank
  FROM documents
  LIMIT 20
),
keyword_results AS (
  SELECT id, content, ts_rank(to_tsvector('english', content), query) AS score,
         ROW_NUMBER() OVER (ORDER BY score DESC) AS rank
  FROM documents, to_tsquery('english', $2) query
  WHERE to_tsvector('english', content) @@ query
  LIMIT 20
)
SELECT COALESCE(v.id, k.id) AS id,
       COALESCE(v.content, k.content) AS content,
       (COALESCE(1.0 / (60 + v.rank), 0) + COALESCE(1.0 / (60 + k.rank), 0)) AS rrf_score
FROM vector_results v
FULL OUTER JOIN keyword_results k ON v.id = k.id
ORDER BY rrf_score DESC
LIMIT 5;

Conclusion

RAG is currently the most practical way to build LLM applications grounded in your own data. pgvector keeps everything in Postgres, eliminating a separate vector database. Focus your tuning effort on chunking strategy and retrieval quality before touching the LLM.

Key takeaways:

Chunk size: 256–512 tokens with 10–15% overlap works for most text
Always filter by similarity threshold — don't pass low-quality context to the LLM
Hybrid search (vector + BM25) beats pure vector search on most benchmarks
Use text-embedding-3-small — it's 5x cheaper than large and nearly as good