Building Production RAG Systems with TypeScript and pgvector
Introduction
Large Language Models know a lot, but they don't know your data. Retrieval-Augmented Generation (RAG) bridges that gap: at query time, you retrieve the most relevant chunks from a vector database and inject them into the LLM's context. The LLM answers from your data, not from its training.
This guide builds a complete, production-ready RAG pipeline using TypeScript, OpenAI embeddings, and pgvector — all in Postgres.
Architecture
Documents → Chunk → Embed → Store in pgvector
User query → Embed → Similarity search → Top-K chunks → LLM → Answer
Step 1: Enable pgvector in Postgres
CREATE EXTENSION IF NOT EXISTS vector;
CREATE TABLE documents (
id BIGSERIAL PRIMARY KEY,
source TEXT NOT NULL, -- filename or URL
chunk_index INTEGER NOT NULL, -- position in source
content TEXT NOT NULL, -- the raw text chunk
embedding VECTOR(1536), -- OpenAI text-embedding-3-small
metadata JSONB DEFAULT '{}'
);
-- IVFFlat index for approximate nearest neighbour search
CREATE INDEX ON documents USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);
Step 2: Chunking Strategy
Chunking has a huge impact on retrieval quality. The goal: chunks small enough to be precise, large enough to be meaningful.
// lib/chunker.ts
interface Chunk {
content: string;
chunkIndex: number;
source: string;
}
export function chunkText(text: string, source: string, options = {
chunkSize: 512,
overlap: 64,
}): Chunk[] {
const { chunkSize, overlap } = options;
const chunks: Chunk[] = [];
let start = 0;
let index = 0;
while (start < text.length) {
const end = Math.min(start + chunkSize, text.length);
let chunkEnd = end;
// Try to break at sentence boundary
if (end < text.length) {
const sentenceEnd = text.lastIndexOf('.', end);
if (sentenceEnd > start + chunkSize / 2) {
chunkEnd = sentenceEnd + 1;
}
}
chunks.push({
content: text.slice(start, chunkEnd).trim(),
chunkIndex: index++,
source,
});
start = chunkEnd - overlap;
}
return chunks;
}
Step 3: Embedding and Storing
// lib/embed.ts
import OpenAI from 'openai';
import { db } from './db';
const openai = new OpenAI();
export async function embedAndStore(chunks: Chunk[]) {
// Batch embedding — up to 2048 inputs per call
const BATCH_SIZE = 100;
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
const batch = chunks.slice(i, i + BATCH_SIZE);
const response = await openai.embeddings.create({
model: 'text-embedding-3-small',
input: batch.map(c => c.content),
});
const rows = batch.map((chunk, j) => ({
source: chunk.source,
chunk_index: chunk.chunkIndex,
content: chunk.content,
embedding: JSON.stringify(response.data[j].embedding),
}));
await db.query(
`INSERT INTO documents (source, chunk_index, content, embedding)
SELECT * FROM UNNEST($1::text[], $2::int[], $3::text[], $4::vector[])`,
[
rows.map(r => r.source),
rows.map(r => r.chunk_index),
rows.map(r => r.content),
rows.map(r => r.embedding),
]
);
console.log(`Stored ${i + batch.length}/${chunks.length} chunks`);
}
}
Step 4: Retrieval
// lib/retrieve.ts
export async function retrieveContext(query: string, topK = 5): Promise<string[]> {
// Embed the query
const response = await openai.embeddings.create({
model: 'text-embedding-3-small',
input: query,
});
const queryEmbedding = JSON.stringify(response.data[0].embedding);
// Cosine similarity search
const result = await db.query<{ content: string; similarity: number }>(
`SELECT content, 1 - (embedding <=> $1::vector) AS similarity
FROM documents
ORDER BY embedding <=> $1::vector
LIMIT $2`,
[queryEmbedding, topK]
);
return result.rows
.filter(r => r.similarity > 0.75) // Quality threshold
.map(r => r.content);
}
Step 5: Grounded Generation
// lib/rag.ts
export async function ragQuery(userQuestion: string): Promise<string> {
const contextChunks = await retrieveContext(userQuestion);
if (!contextChunks.length) {
return "I don't have relevant information to answer that question from your documents.";
}
const context = contextChunks.join('
---
');
const response = await openai.chat.completions.create({
model: 'gpt-4o-mini',
messages: [
{
role: 'system',
content: `You are a helpful assistant. Answer the user's question using ONLY the context below.
If the context doesn't contain the answer, say so clearly.
Do not make up information.
Context:
${context}`,
},
{ role: 'user', content: userQuestion },
],
temperature: 0.1,
});
return response.choices[0].message.content ?? '';
}
Improving Retrieval Quality
Hybrid Search (vector + full-text)
-- Combine vector similarity with keyword search using RRF
WITH vector_results AS (
SELECT id, content, 1 - (embedding <=> $1::vector) AS score,
ROW_NUMBER() OVER (ORDER BY embedding <=> $1::vector) AS rank
FROM documents
LIMIT 20
),
keyword_results AS (
SELECT id, content, ts_rank(to_tsvector('english', content), query) AS score,
ROW_NUMBER() OVER (ORDER BY score DESC) AS rank
FROM documents, to_tsquery('english', $2) query
WHERE to_tsvector('english', content) @@ query
LIMIT 20
)
SELECT COALESCE(v.id, k.id) AS id,
COALESCE(v.content, k.content) AS content,
(COALESCE(1.0 / (60 + v.rank), 0) + COALESCE(1.0 / (60 + k.rank), 0)) AS rrf_score
FROM vector_results v
FULL OUTER JOIN keyword_results k ON v.id = k.id
ORDER BY rrf_score DESC
LIMIT 5;
Conclusion
RAG is currently the most practical way to build LLM applications grounded in your own data. pgvector keeps everything in Postgres, eliminating a separate vector database. Focus your tuning effort on chunking strategy and retrieval quality before touching the LLM.
Key takeaways:
- Chunk size: 256–512 tokens with 10–15% overlap works for most text
- Always filter by similarity threshold — don't pass low-quality context to the LLM
- Hybrid search (vector + BM25) beats pure vector search on most benchmarks
- Use
text-embedding-3-small— it's 5x cheaper than large and nearly as good