From ffb035b0fb84218bf05b3bb0c468352bc7feeff6 Mon Sep 17 00:00:00 2001 From: Efaz Mohammed <44260523+WhiteHades@users.noreply.github.com> Date: Tue, 21 Oct 2025 13:57:49 +0200 Subject: [PATCH 1/3] feat: add token_count field to schema --- src/gitingest/schemas/filesystem.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index cc66e7b1..7f5d716e 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -48,6 +48,7 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes file_count: int = 0 dir_count: int = 0 depth: int = 0 + token_count: int = 0 children: list[FileSystemNode] = field(default_factory=list) def sort_children(self) -> None: From cc5e5c93ee6d26ea91c7dceed92cba697ead6e02 Mon Sep 17 00:00:00 2001 From: Efaz Mohammed <44260523+WhiteHades@users.noreply.github.com> Date: Tue, 21 Oct 2025 16:18:11 +0200 Subject: [PATCH 2/3] feat: add token counting utils with caching --- src/gitingest/output_formatter.py | 99 +++++++++++++++++++++++++++---- 1 file changed, 86 insertions(+), 13 deletions(-) diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 5c2b59ae..f9eed475 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -3,7 +3,7 @@ from __future__ import annotations import ssl -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import requests.exceptions import tiktoken @@ -23,6 +23,66 @@ (1_000, "k"), ] +# cache tiktoken encoding for performance +_TIKTOKEN_ENCODING: Any | None = None + +def _get_tiktoken_encoding() -> Any: + """Get cached tiktoken encoding, initializing only once.""" + global _TIKTOKEN_ENCODING + if _TIKTOKEN_ENCODING is None: + _TIKTOKEN_ENCODING = tiktoken.get_encoding("o200k_base") + return _TIKTOKEN_ENCODING + + +def _estimate_tokens(text: str) -> int: + """Estimate token count for a given text. 
+ + Parameters + ---------- + text : str + The text string for which the token count is to be estimated. + + Returns + ------- + int + The number of tokens, or 0 if an error occurs. + + """ + if not text: + return 0 + try: + encoding = _get_tiktoken_encoding() + return len(encoding.encode(text, disallowed_special=())) + except (ValueError, UnicodeEncodeError) as exc: + logger.warning("Failed to estimate token size", extra={"error": str(exc)}) + return 0 + except (requests.exceptions.RequestException, ssl.SSLError) as exc: + # if network errors, skip token count estimation instead of erroring out + logger.warning("Failed to download tiktoken model", extra={"error": str(exc)}) + return 0 + + +def _format_token_number(count: int) -> str: + """Return a human-readable token-count string (e.g. 1.2k, 1.2M). + + Parameters + ---------- + count : int + The token count to format. + + Returns + ------- + str + The formatted number of tokens as a string (e.g., ``"1.2k"``, ``"1.2M"``), or empty string if count is 0. + + """ + if count == 0: + return "" + for threshold, suffix in _TOKEN_THRESHOLDS: + if count >= threshold: + return f"{count / threshold:.1f}{suffix}" + return str(count) + def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str, str]: """Generate a summary, directory structure, and file contents for a given file system node. 
@@ -51,9 +111,17 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str, summary += f"File: {node.name}\n" summary += f"Lines: {len(node.content.splitlines()):,}\n" + content = _gather_file_contents(node) + tree = "Directory structure:\n" + _create_tree_structure(query, node=node) - content = _gather_file_contents(node) + # calculate total tokens for entire digest (tree + content) - what users download/copy + total_tokens = _estimate_tokens(tree + content) + + # set root node token count to match the total exactly + node.token_count = total_tokens + + tree = "Directory structure:\n" + _create_tree_structure(query, node=node) token_estimate = _format_token_count(tree + content) if token_estimate: @@ -107,6 +175,7 @@ def _gather_file_contents(node: FileSystemNode) -> str: This function recursively processes a directory node and gathers the contents of all files under that node. It returns the concatenated content of all files as a single string. + Also calculates and aggregates token counts during traversal. 
Parameters ---------- @@ -120,10 +189,17 @@ def _gather_file_contents(node: FileSystemNode) -> str: """ if node.type != FileSystemNodeType.DIRECTORY: + node.token_count = _estimate_tokens(node.content) return node.content_string - # Recursively gather contents of all files under the current directory - return "\n".join(_gather_file_contents(child) for child in node.children) + # recursively gather contents and aggregate token counts + node.token_count = 0 + contents = [] + for child in node.children: + contents.append(_gather_file_contents(child)) + node.token_count += child.token_count + + return "\n".join(contents) def _create_tree_structure( @@ -169,6 +245,10 @@ def _create_tree_structure( elif node.type == FileSystemNodeType.SYMLINK: display_name += " -> " + readlink(node.path).name + if node.token_count > 0: + formatted_tokens = _format_token_number(node.token_count) + display_name += f" ({formatted_tokens} tokens)" + tree_str += f"{prefix}{current_prefix}{display_name}\n" if node.type == FileSystemNodeType.DIRECTORY and node.children: @@ -192,15 +272,8 @@ def _format_token_count(text: str) -> str | None: The formatted number of tokens as a string (e.g., ``"1.2k"``, ``"1.2M"``), or ``None`` if an error occurs. 
""" - try: - encoding = tiktoken.get_encoding("o200k_base") # gpt-4o, gpt-4o-mini - total_tokens = len(encoding.encode(text, disallowed_special=())) - except (ValueError, UnicodeEncodeError) as exc: - logger.warning("Failed to estimate token size", extra={"error": str(exc)}) - return None - except (requests.exceptions.RequestException, ssl.SSLError) as exc: - # If network errors, skip token count estimation instead of erroring out - logger.warning("Failed to download tiktoken model", extra={"error": str(exc)}) + total_tokens = _estimate_tokens(text) + if total_tokens == 0: return None for threshold, suffix in _TOKEN_THRESHOLDS: From 043386122eb2360a1bc1b1bb19c531e626d1e292 Mon Sep 17 00:00:00 2001 From: Efaz Mohammed <44260523+WhiteHades@users.noreply.github.com> Date: Tue, 21 Oct 2025 19:33:58 +0200 Subject: [PATCH 3/3] feat: render token in frontend tree --- src/static/js/utils.js | 222 +++++++++++++++++++++++++++-------------- 1 file changed, 147 insertions(+), 75 deletions(-) diff --git a/src/static/js/utils.js b/src/static/js/utils.js index ce19e95e..9dc00719 100644 --- a/src/static/js/utils.js +++ b/src/static/js/utils.js @@ -4,18 +4,22 @@ function getFileName(element) { let prevIndentLevel = null; while (element) { - const line = element.textContent; + const preElement = element.querySelector('pre'); + if (!preElement) { + break; + } + + const line = preElement.textContent; const index = line.search(/[a-zA-Z0-9_.-]/); const indentLevel = index / indentSize; - // Stop when we reach or go above the top-level directory if (indentLevel <= 1) { break; } - // Only include directories that are one level above the previous if (prevIndentLevel === null || indentLevel === prevIndentLevel - 1) { - const fileName = line.substring(index).trim(); + let fileName = line.substring(index).trim(); + fileName = fileName.replace(/\s+\([\d.,]+[kM]?\s+tokens\)$/, ''); path = fileName + path; prevIndentLevel = indentLevel; @@ -29,15 +33,9 @@ function getFileName(element) { function 
toggleFile(element) { const patternInput = document.getElementById('pattern'); - const patternFiles = patternInput.value ? patternInput.value.split(',').map((item) => item.trim()) : []; - - const directoryContainer = document.getElementById('directory-structure-container'); - const treeLineElements = Array.from(directoryContainer.children).filter((child) => child.tagName === 'PRE'); - - // Skip the first two tree lines (header and repository name) - if (treeLineElements[0] === element || treeLineElements[1] === element) { - return; - } + const patternFiles = patternInput.value + ? patternInput.value.split(',').map((item) => item.trim()) + : []; element.classList.toggle('line-through'); element.classList.toggle('text-gray-500'); @@ -54,38 +52,42 @@ function toggleFile(element) { patternInput.value = patternFiles.join(', '); } -// Copy functionality function copyText(className) { let textToCopy; if (className === 'directory-structure') { - // For directory structure, get the hidden input value - const hiddenInput = document.getElementById('directory-structure-content'); + const hiddenInput = document.getElementById( + 'directory-structure-content' + ); - if (!hiddenInput) {return;} + if (!hiddenInput) { + return; + } textToCopy = hiddenInput.value; } else { - // For other elements, get the textarea value - const textarea = document.querySelector(`.${ className }`); + const textarea = document.querySelector(`.${className}`); - if (!textarea) {return;} + if (!textarea) { + return; + } textToCopy = textarea.value; } - const button = document.querySelector(`button[onclick="copyText('${className}')"]`); + const button = document.querySelector( + `button[onclick="copyText('${className}')"]` + ); - if (!button) {return;} + if (!button) { + return; + } - // Copy text - navigator.clipboard.writeText(textToCopy) + navigator.clipboard + .writeText(textToCopy) .then(() => { - // Store original content const originalContent = button.innerHTML; - // Change button content 
button.innerHTML = 'Copied!'; - // Reset after 1 second setTimeout(() => { button.innerHTML = originalContent; }, 1000); @@ -101,7 +103,6 @@ function copyText(className) { }); } -// Helper functions for toggling result blocks function showLoading() { document.getElementById('results-loading').style.display = 'block'; document.getElementById('results-section').style.display = 'none'; @@ -121,7 +122,6 @@ function showError(msg) { errorDiv.style.display = 'block'; } -// Helper function to collect form data function collectFormData(form) { const json_data = {}; const inputText = form.querySelector('[name="input_text"]'); @@ -130,28 +130,40 @@ function collectFormData(form) { const patternType = document.getElementById('pattern_type'); const pattern = document.getElementById('pattern'); - if (inputText) {json_data.input_text = inputText.value;} - if (token) {json_data.token = token.value;} - if (hiddenInput) {json_data.max_file_size = hiddenInput.value;} - if (patternType) {json_data.pattern_type = patternType.value;} - if (pattern) {json_data.pattern = pattern.value;} + if (inputText) { + json_data.input_text = inputText.value; + } + if (token) { + json_data.token = token.value; + } + if (hiddenInput) { + json_data.max_file_size = hiddenInput.value; + } + if (patternType) { + json_data.pattern_type = patternType.value; + } + if (pattern) { + json_data.pattern = pattern.value; + } return json_data; } -// Helper function to manage button loading state function setButtonLoadingState(submitButton, isLoading) { if (!isLoading) { submitButton.disabled = false; - submitButton.innerHTML = submitButton.getAttribute('data-original-content') || 'Submit'; + submitButton.innerHTML = + submitButton.getAttribute('data-original-content') || 'Submit'; submitButton.classList.remove('bg-[#ffb14d]'); return; } - // Store original content if not already stored if (!submitButton.getAttribute('data-original-content')) { - submitButton.setAttribute('data-original-content', 
submitButton.innerHTML); + submitButton.setAttribute( + 'data-original-content', + submitButton.innerHTML + ); } submitButton.disabled = true; @@ -167,44 +179,81 @@ function setButtonLoadingState(submitButton, isLoading) { submitButton.classList.add('bg-[#ffb14d]'); } -// Helper function to handle successful response function handleSuccessfulResponse(data) { - // Show results section showResults(); - // Store the digest_url for download functionality window.currentDigestUrl = data.digest_url; - // Set plain text content for summary, tree, and content document.getElementById('result-summary').value = data.summary || ''; - document.getElementById('directory-structure-content').value = data.tree || ''; + document.getElementById('directory-structure-content').value = + data.tree || ''; document.getElementById('result-content').value = data.content || ''; - // Populate directory structure lines as clickable
elements
const dirPre = document.getElementById('directory-structure-pre');
if (dirPre && data.tree) {
dirPre.innerHTML = '';
- data.tree.split('\n').forEach((line) => {
- const pre = document.createElement('pre');
-
- pre.setAttribute('name', 'tree-line');
- pre.className = 'cursor-pointer hover:line-through hover:text-gray-500';
- pre.textContent = line;
- pre.onclick = function () { toggleFile(this); };
- dirPre.appendChild(pre);
+ const lines = data.tree.split('\n');
+ const tokenRegex = /^(.+?)(\s+\([\d.,]+[kM]?\s+tokens\))$/; // group 1 = entry text, group 2 = trailing " (N[k|M] tokens)" suffix
+ // Build one flex row per tree line: entry text first, token count (when present) pushed to the far end by justify-between.
+ lines.forEach((line, index) => {
+ const container = document.createElement('div');
+ container.className =
+ 'flex justify-between cursor-pointer hover:line-through hover:text-gray-500';
+ container.setAttribute('name', 'tree-line');
+ // Separate the entry text from its optional token suffix.
+ const match = line.match(tokenRegex);
+ let namePart, tokenPart;
+
+ if (match) {
+ namePart = match[1];
+ tokenPart = match[2].trim(); // drop the leading whitespace captured together with the suffix
+ } else {
+ namePart = line;
+ tokenPart = ''; // empty string => no token span is created below
+ }
+ // The entry text lives in its own pre element; getFileName() reads it back via querySelector('pre').
+ const namePre = document.createElement('pre');
+ namePre.textContent = namePart;
+
+ container.appendChild(namePre);
+ // The token count is a separate span so it can be styled independently of the entry text.
+ if (tokenPart) {
+ const tokenSpan = document.createElement('span');
+ tokenSpan.textContent = tokenPart;
+ tokenSpan.style.color = '#6b7280';
+ tokenSpan.style.fontWeight = 'normal';
+ tokenSpan.style.whiteSpace = 'nowrap';
+ // Entries whose text ends in '/' get a bold count (presumably directories - confirm against the tree format).
+ if (namePart.trim().endsWith('/')) {
+ tokenSpan.style.fontWeight = 'bold';
+ }
+
+ container.appendChild(tokenSpan);
+ }
+ // Only rows from index 2 onward are clickable; rows 0 and 1 get no click handler.
+ if (index >= 2) {
+ container.onclick = function () {
+ toggleFile(this); // regular function, so 'this' is the row container that was clicked
+ };
+ }
+
+ dirPre.appendChild(container);
});
}
- // Scroll to results
- document.getElementById('results-section').scrollIntoView({ behavior: 'smooth', block: 'start' });
+ document
+ .getElementById('results-section')
+ .scrollIntoView({ behavior: 'smooth', block: 'start' });
}
function handleSubmit(event, showLoadingSpinner = false) {
event.preventDefault();
const form = event.target || document.getElementById('ingestForm');
- if (!form) {return;}
+ if (!form) {
+ return;
+ }
// Ensure hidden input is updated before collecting form data
const slider = document.getElementById('file_size');
@@ -220,7 +269,9 @@ function handleSubmit(event, showLoadingSpinner = false) {
const submitButton = form.querySelector('button[type="submit"]');
- if (!submitButton) {return;}
+ if (!submitButton) {
+ return;
+ }
const json_data = collectFormData(form);
@@ -232,7 +283,7 @@ function handleSubmit(event, showLoadingSpinner = false) {
fetch('/api/ingest', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
- body: JSON.stringify(json_data)
+ body: JSON.stringify(json_data),
})
.then(async (response) => {
let data;
@@ -247,21 +298,33 @@ function handleSubmit(event, showLoadingSpinner = false) {
if (!response.ok) {
// Show all error details if present
if (Array.isArray(data.detail)) {
- const details = data.detail.map((d) => `