fix(memory): recalculate window end after snapping to line boundary
When windowStart is moved back to align with a line boundary, windowEnd must be recalculated to maintain the maxChars window size. Without this, the snippet could exceed maxChars and truncateUtf16Safe would trim the tail, potentially removing the matched query term.
This commit is contained in:
parent
4b309e205e
commit
6cb5cae6cf
@ -5,6 +5,72 @@ import { cosineSimilarity, parseEmbedding } from "./internal.js";
|
||||
const vectorToBlob = (embedding: number[]): Buffer =>
|
||||
Buffer.from(new Float32Array(embedding).buffer);
|
||||
|
||||
/**
|
||||
* Extract a relevant snippet window around the query match in the text.
|
||||
* If the query is found, returns a window centered on the match.
|
||||
* Otherwise falls back to the beginning of the text.
|
||||
*/
|
||||
function extractRelevantSnippet(
|
||||
text: string,
|
||||
query: string,
|
||||
maxChars: number,
|
||||
): { snippet: string; offsetLines: number } {
|
||||
if (text.length <= maxChars) {
|
||||
return { snippet: text, offsetLines: 0 };
|
||||
}
|
||||
|
||||
// Try to find the query (case-insensitive) in the text
|
||||
const lowerText = text.toLowerCase();
|
||||
const queryTerms = query
|
||||
.toLowerCase()
|
||||
.split(/\s+/)
|
||||
.filter((term) => term.length > 2);
|
||||
|
||||
let matchIndex = -1;
|
||||
|
||||
// Find the first matching term
|
||||
for (const term of queryTerms) {
|
||||
const idx = lowerText.indexOf(term);
|
||||
if (idx !== -1) {
|
||||
matchIndex = idx;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If no match found, fall back to beginning
|
||||
if (matchIndex === -1) {
|
||||
return { snippet: truncateUtf16Safe(text, maxChars), offsetLines: 0 };
|
||||
}
|
||||
|
||||
// Calculate window start, trying to center the match
|
||||
const halfWindow = Math.floor(maxChars / 2);
|
||||
let windowStart = Math.max(0, matchIndex - halfWindow);
|
||||
let windowEnd = Math.min(text.length, windowStart + maxChars);
|
||||
|
||||
// Adjust if we're near the end
|
||||
if (windowEnd === text.length && windowEnd - windowStart < maxChars) {
|
||||
windowStart = Math.max(0, windowEnd - maxChars);
|
||||
}
|
||||
|
||||
// Try to start at a line boundary for cleaner output
|
||||
if (windowStart > 0) {
|
||||
const lineStart = text.lastIndexOf("\n", windowStart);
|
||||
if (lineStart !== -1 && windowStart - lineStart < 100) {
|
||||
windowStart = lineStart + 1;
|
||||
// Recalculate windowEnd to maintain maxChars length after snap
|
||||
windowEnd = Math.min(text.length, windowStart + maxChars);
|
||||
}
|
||||
}
|
||||
|
||||
// Count lines before the window to adjust startLine/endLine display
|
||||
const textBeforeWindow = text.substring(0, windowStart);
|
||||
const offsetLines = (textBeforeWindow.match(/\n/g) || []).length;
|
||||
|
||||
const snippet = text.substring(windowStart, windowEnd);
|
||||
return { snippet: truncateUtf16Safe(snippet, maxChars), offsetLines };
|
||||
}
|
||||
|
||||
|
||||
export type SearchSource = string;
|
||||
|
||||
export type SearchRowResult = {
|
||||
@ -85,17 +151,6 @@ export async function searchVector(params: {
|
||||
return scored
|
||||
.toSorted((a, b) => b.score - a.score)
|
||||
.slice(0, params.limit)
|
||||
<<<<<<< HEAD
|
||||
.map((entry) => ({
|
||||
id: entry.chunk.id,
|
||||
path: entry.chunk.path,
|
||||
startLine: entry.chunk.startLine,
|
||||
endLine: entry.chunk.endLine,
|
||||
score: entry.score,
|
||||
snippet: truncateUtf16Safe(entry.chunk.text, params.snippetMaxChars),
|
||||
source: entry.chunk.source,
|
||||
}));
|
||||
=======
|
||||
.map((entry) => {
|
||||
const { snippet, offsetLines } = extractRelevantSnippet(
|
||||
entry.chunk.text,
|
||||
@ -112,7 +167,6 @@ export async function searchVector(params: {
|
||||
source: entry.chunk.source,
|
||||
};
|
||||
});
|
||||
>>>>>>> 58e5ec73b (fix(memory): use offsetLines to report accurate snippet start positions)
|
||||
}
|
||||
|
||||
export function listChunks(params: {
|
||||
@ -199,10 +253,7 @@ export async function searchKeyword(params: {
|
||||
|
||||
return rows.map((row) => {
|
||||
const textScore = params.bm25RankToScore(row.rank);
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
const { snippet, offsetLines } = extractRelevantSnippet(row.text, params.query, params.snippetMaxChars);
|
||||
>>>>>>> 58e5ec73b (fix(memory): use offsetLines to report accurate snippet start positions)
|
||||
return {
|
||||
id: row.id,
|
||||
path: row.path,
|
||||
@ -210,7 +261,7 @@ export async function searchKeyword(params: {
|
||||
endLine: row.end_line,
|
||||
score: textScore,
|
||||
textScore,
|
||||
snippet: truncateUtf16Safe(row.text, params.snippetMaxChars),
|
||||
snippet,
|
||||
source: row.source,
|
||||
};
|
||||
});
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user