Merge 11c02404e4edd818c1d3b181b3f55412279342c4 into 8a05c05596ca9ba0735dafd8e359885de4c2c969
This commit is contained in:
commit
2ec94df9f1
@ -5,6 +5,72 @@ import { cosineSimilarity, parseEmbedding } from "./internal.js";
|
||||
const vectorToBlob = (embedding: number[]): Buffer =>
|
||||
Buffer.from(new Float32Array(embedding).buffer);
|
||||
|
||||
/**
|
||||
* Extract a relevant snippet window around the query match in the text.
|
||||
* If the query is found, returns a window centered on the match.
|
||||
* Otherwise falls back to the beginning of the text.
|
||||
*/
|
||||
function extractRelevantSnippet(
|
||||
text: string,
|
||||
query: string,
|
||||
maxChars: number,
|
||||
): { snippet: string; offsetLines: number } {
|
||||
if (text.length <= maxChars) {
|
||||
return { snippet: text, offsetLines: 0 };
|
||||
}
|
||||
|
||||
// Try to find the query (case-insensitive) in the text
|
||||
const lowerText = text.toLowerCase();
|
||||
const queryTerms = query
|
||||
.toLowerCase()
|
||||
.split(/\s+/)
|
||||
.filter((term) => term.length > 2);
|
||||
|
||||
let matchIndex = -1;
|
||||
|
||||
// Find the first matching term
|
||||
for (const term of queryTerms) {
|
||||
const idx = lowerText.indexOf(term);
|
||||
if (idx !== -1) {
|
||||
matchIndex = idx;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If no match found, fall back to beginning
|
||||
if (matchIndex === -1) {
|
||||
return { snippet: truncateUtf16Safe(text, maxChars), offsetLines: 0 };
|
||||
}
|
||||
|
||||
// Calculate window start, trying to center the match
|
||||
const halfWindow = Math.floor(maxChars / 2);
|
||||
let windowStart = Math.max(0, matchIndex - halfWindow);
|
||||
let windowEnd = Math.min(text.length, windowStart + maxChars);
|
||||
|
||||
// Adjust if we're near the end
|
||||
if (windowEnd === text.length && windowEnd - windowStart < maxChars) {
|
||||
windowStart = Math.max(0, windowEnd - maxChars);
|
||||
}
|
||||
|
||||
// Try to start at a line boundary for cleaner output
|
||||
if (windowStart > 0) {
|
||||
const lineStart = text.lastIndexOf("\n", windowStart);
|
||||
if (lineStart !== -1 && windowStart - lineStart < 100) {
|
||||
windowStart = lineStart + 1;
|
||||
// Recalculate windowEnd to maintain maxChars length after snap
|
||||
windowEnd = Math.min(text.length, windowStart + maxChars);
|
||||
}
|
||||
}
|
||||
|
||||
// Count lines before the window to adjust startLine/endLine display
|
||||
const textBeforeWindow = text.substring(0, windowStart);
|
||||
const offsetLines = (textBeforeWindow.match(/\n/g) || []).length;
|
||||
|
||||
const snippet = text.substring(windowStart, windowEnd);
|
||||
return { snippet: truncateUtf16Safe(snippet, maxChars), offsetLines };
|
||||
}
|
||||
|
||||
|
||||
export type SearchSource = string;
|
||||
|
||||
export type SearchRowResult = {
|
||||
@ -22,6 +88,7 @@ export async function searchVector(params: {
|
||||
vectorTable: string;
|
||||
providerModel: string;
|
||||
queryVec: number[];
|
||||
queryText: string;
|
||||
limit: number;
|
||||
snippetMaxChars: number;
|
||||
ensureVectorReady: (dimensions: number) => Promise<boolean>;
|
||||
@ -57,15 +124,18 @@ export async function searchVector(params: {
|
||||
source: SearchSource;
|
||||
dist: number;
|
||||
}>;
|
||||
return rows.map((row) => ({
|
||||
id: row.id,
|
||||
path: row.path,
|
||||
startLine: row.start_line,
|
||||
endLine: row.end_line,
|
||||
score: 1 - row.dist,
|
||||
snippet: truncateUtf16Safe(row.text, params.snippetMaxChars),
|
||||
source: row.source,
|
||||
}));
|
||||
return rows.map((row) => {
|
||||
const { snippet, offsetLines } = extractRelevantSnippet(row.text, params.queryText, params.snippetMaxChars);
|
||||
return {
|
||||
id: row.id,
|
||||
path: row.path,
|
||||
startLine: row.start_line + offsetLines,
|
||||
endLine: row.end_line,
|
||||
score: 1 - row.dist,
|
||||
snippet,
|
||||
source: row.source,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
const candidates = listChunks({
|
||||
@ -82,15 +152,22 @@ export async function searchVector(params: {
|
||||
return scored
|
||||
.toSorted((a, b) => b.score - a.score)
|
||||
.slice(0, params.limit)
|
||||
.map((entry) => ({
|
||||
id: entry.chunk.id,
|
||||
path: entry.chunk.path,
|
||||
startLine: entry.chunk.startLine,
|
||||
endLine: entry.chunk.endLine,
|
||||
score: entry.score,
|
||||
snippet: truncateUtf16Safe(entry.chunk.text, params.snippetMaxChars),
|
||||
source: entry.chunk.source,
|
||||
}));
|
||||
.map((entry) => {
|
||||
const { snippet, offsetLines } = extractRelevantSnippet(
|
||||
entry.chunk.text,
|
||||
params.queryText,
|
||||
params.snippetMaxChars,
|
||||
);
|
||||
return {
|
||||
id: entry.chunk.id,
|
||||
path: entry.chunk.path,
|
||||
startLine: entry.chunk.startLine + offsetLines,
|
||||
endLine: entry.chunk.endLine,
|
||||
score: entry.score,
|
||||
snippet,
|
||||
source: entry.chunk.source,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
export function listChunks(params: {
|
||||
@ -177,14 +254,15 @@ export async function searchKeyword(params: {
|
||||
|
||||
return rows.map((row) => {
|
||||
const textScore = params.bm25RankToScore(row.rank);
|
||||
const { snippet, offsetLines } = extractRelevantSnippet(row.text, params.query, params.snippetMaxChars);
|
||||
return {
|
||||
id: row.id,
|
||||
path: row.path,
|
||||
startLine: row.start_line,
|
||||
startLine: row.start_line + offsetLines,
|
||||
endLine: row.end_line,
|
||||
score: textScore,
|
||||
textScore,
|
||||
snippet: truncateUtf16Safe(row.text, params.snippetMaxChars),
|
||||
snippet,
|
||||
source: row.source,
|
||||
};
|
||||
});
|
||||
|
||||
@ -345,7 +345,7 @@ export class MemoryIndexManager extends MemoryManagerEmbeddingOps implements Mem
|
||||
const queryVec = await this.embedQueryWithTimeout(cleaned);
|
||||
const hasVector = queryVec.some((v) => v !== 0);
|
||||
const vectorResults = hasVector
|
||||
? await this.searchVector(queryVec, candidates).catch(() => [])
|
||||
? await this.searchVector(queryVec, cleaned, candidates).catch(() => [])
|
||||
: [];
|
||||
|
||||
if (!hybrid.enabled || !this.fts.enabled || !this.fts.available) {
|
||||
@ -386,6 +386,7 @@ export class MemoryIndexManager extends MemoryManagerEmbeddingOps implements Mem
|
||||
|
||||
private async searchVector(
|
||||
queryVec: number[],
|
||||
queryText: string,
|
||||
limit: number,
|
||||
): Promise<Array<MemorySearchResult & { id: string }>> {
|
||||
// This method should never be called without a provider
|
||||
@ -397,6 +398,7 @@ export class MemoryIndexManager extends MemoryManagerEmbeddingOps implements Mem
|
||||
vectorTable: VECTOR_TABLE,
|
||||
providerModel: this.provider.model,
|
||||
queryVec,
|
||||
queryText,
|
||||
limit,
|
||||
snippetMaxChars: SNIPPET_MAX_CHARS,
|
||||
ensureVectorReady: async (dimensions) => await this.ensureVectorReady(dimensions),
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user