From ffb035b0fb84218bf05b3bb0c468352bc7feeff6 Mon Sep 17 00:00:00 2001 From: Efaz Mohammed <44260523+WhiteHades@users.noreply.github.com> Date: Tue, 21 Oct 2025 13:57:49 +0200 Subject: [PATCH 1/3] feat: add token_count field to schema --- src/gitingest/schemas/filesystem.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index cc66e7b1..7f5d716e 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -48,6 +48,7 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes file_count: int = 0 dir_count: int = 0 depth: int = 0 + token_count: int = 0 children: list[FileSystemNode] = field(default_factory=list) def sort_children(self) -> None: From cc5e5c93ee6d26ea91c7dceed92cba697ead6e02 Mon Sep 17 00:00:00 2001 From: Efaz Mohammed <44260523+WhiteHades@users.noreply.github.com> Date: Tue, 21 Oct 2025 16:18:11 +0200 Subject: [PATCH 2/3] feat: add token counting utils with caching --- src/gitingest/output_formatter.py | 99 +++++++++++++++++++++++++++---- 1 file changed, 86 insertions(+), 13 deletions(-) diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 5c2b59ae..f9eed475 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -3,7 +3,7 @@ from __future__ import annotations import ssl -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import requests.exceptions import tiktoken @@ -23,6 +23,66 @@ (1_000, "k"), ] +# cache tiktoken encoding for performance +_TIKTOKEN_ENCODING: Any | None = None + +def _get_tiktoken_encoding() -> Any: + """Get cached tiktoken encoding, initializing only once.""" + global _TIKTOKEN_ENCODING + if _TIKTOKEN_ENCODING is None: + _TIKTOKEN_ENCODING = tiktoken.get_encoding("o200k_base") + return _TIKTOKEN_ENCODING + + +def _estimate_tokens(text: str) -> int: + """Estimate token count for a given text. 
+ + Parameters + ---------- + text : str + The text string for which the token count is to be estimated. + + Returns + ------- + int + The number of tokens, or 0 if an error occurs. + + """ + if not text: + return 0 + try: + encoding = _get_tiktoken_encoding() + return len(encoding.encode(text, disallowed_special=())) + except (ValueError, UnicodeEncodeError) as exc: + logger.warning("Failed to estimate token size", extra={"error": str(exc)}) + return 0 + except (requests.exceptions.RequestException, ssl.SSLError) as exc: + # if network errors, skip token count estimation instead of erroring out + logger.warning("Failed to download tiktoken model", extra={"error": str(exc)}) + return 0 + + +def _format_token_number(count: int) -> str: + """Return a human-readable token-count string (e.g. 1.2k, 1.2M). + + Parameters + ---------- + count : int + The token count to format. + + Returns + ------- + str + The formatted number of tokens as a string (e.g., ``"1.2k"``, ``"1.2M"``), or empty string if count is 0. + + """ + if count == 0: + return "" + for threshold, suffix in _TOKEN_THRESHOLDS: + if count >= threshold: + return f"{count / threshold:.1f}{suffix}" + return str(count) + def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str, str]: """Generate a summary, directory structure, and file contents for a given file system node. 
@@ -51,9 +111,17 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str, summary += f"File: {node.name}\n" summary += f"Lines: {len(node.content.splitlines()):,}\n" + content = _gather_file_contents(node) + tree = "Directory structure:\n" + _create_tree_structure(query, node=node) - content = _gather_file_contents(node) + # calculate total tokens for entire digest (tree + content) - what users download/copy + total_tokens = _estimate_tokens(tree + content) + + # set root node token count to match the total exactly + node.token_count = total_tokens + + tree = "Directory structure:\n" + _create_tree_structure(query, node=node) token_estimate = _format_token_count(tree + content) if token_estimate: @@ -107,6 +175,7 @@ def _gather_file_contents(node: FileSystemNode) -> str: This function recursively processes a directory node and gathers the contents of all files under that node. It returns the concatenated content of all files as a single string. + Also calculates and aggregates token counts during traversal. 
Parameters ---------- @@ -120,10 +189,17 @@ def _gather_file_contents(node: FileSystemNode) -> str: """ if node.type != FileSystemNodeType.DIRECTORY: + node.token_count = _estimate_tokens(node.content) return node.content_string - # Recursively gather contents of all files under the current directory - return "\n".join(_gather_file_contents(child) for child in node.children) + # recursively gather contents and aggregate token counts + node.token_count = 0 + contents = [] + for child in node.children: + contents.append(_gather_file_contents(child)) + node.token_count += child.token_count + + return "\n".join(contents) def _create_tree_structure( @@ -169,6 +245,10 @@ def _create_tree_structure( elif node.type == FileSystemNodeType.SYMLINK: display_name += " -> " + readlink(node.path).name + if node.token_count > 0: + formatted_tokens = _format_token_number(node.token_count) + display_name += f" ({formatted_tokens} tokens)" + tree_str += f"{prefix}{current_prefix}{display_name}\n" if node.type == FileSystemNodeType.DIRECTORY and node.children: @@ -192,15 +272,8 @@ def _format_token_count(text: str) -> str | None: The formatted number of tokens as a string (e.g., ``"1.2k"``, ``"1.2M"``), or ``None`` if an error occurs. 
""" - try: - encoding = tiktoken.get_encoding("o200k_base") # gpt-4o, gpt-4o-mini - total_tokens = len(encoding.encode(text, disallowed_special=())) - except (ValueError, UnicodeEncodeError) as exc: - logger.warning("Failed to estimate token size", extra={"error": str(exc)}) - return None - except (requests.exceptions.RequestException, ssl.SSLError) as exc: - # If network errors, skip token count estimation instead of erroring out - logger.warning("Failed to download tiktoken model", extra={"error": str(exc)}) + total_tokens = _estimate_tokens(text) + if total_tokens == 0: return None for threshold, suffix in _TOKEN_THRESHOLDS: From 043386122eb2360a1bc1b1bb19c531e626d1e292 Mon Sep 17 00:00:00 2001 From: Efaz Mohammed <44260523+WhiteHades@users.noreply.github.com> Date: Tue, 21 Oct 2025 19:33:58 +0200 Subject: [PATCH 3/3] feat: render token in frontend tree --- src/static/js/utils.js | 222 +++++++++++++++++++++++++++-------------- 1 file changed, 147 insertions(+), 75 deletions(-) diff --git a/src/static/js/utils.js b/src/static/js/utils.js index ce19e95e..9dc00719 100644 --- a/src/static/js/utils.js +++ b/src/static/js/utils.js @@ -4,18 +4,22 @@ function getFileName(element) { let prevIndentLevel = null; while (element) { - const line = element.textContent; + const preElement = element.querySelector('pre'); + if (!preElement) { + break; + } + + const line = preElement.textContent; const index = line.search(/[a-zA-Z0-9_.-]/); const indentLevel = index / indentSize; - // Stop when we reach or go above the top-level directory if (indentLevel <= 1) { break; } - // Only include directories that are one level above the previous if (prevIndentLevel === null || indentLevel === prevIndentLevel - 1) { - const fileName = line.substring(index).trim(); + let fileName = line.substring(index).trim(); + fileName = fileName.replace(/\s+\([\d.,]+[kM]?\s+tokens\)$/, ''); path = fileName + path; prevIndentLevel = indentLevel; @@ -29,15 +33,9 @@ function getFileName(element) { function 
toggleFile(element) { const patternInput = document.getElementById('pattern'); - const patternFiles = patternInput.value ? patternInput.value.split(',').map((item) => item.trim()) : []; - - const directoryContainer = document.getElementById('directory-structure-container'); - const treeLineElements = Array.from(directoryContainer.children).filter((child) => child.tagName === 'PRE'); - - // Skip the first two tree lines (header and repository name) - if (treeLineElements[0] === element || treeLineElements[1] === element) { - return; - } + const patternFiles = patternInput.value + ? patternInput.value.split(',').map((item) => item.trim()) + : []; element.classList.toggle('line-through'); element.classList.toggle('text-gray-500'); @@ -54,38 +52,42 @@ function toggleFile(element) { patternInput.value = patternFiles.join(', '); } -// Copy functionality function copyText(className) { let textToCopy; if (className === 'directory-structure') { - // For directory structure, get the hidden input value - const hiddenInput = document.getElementById('directory-structure-content'); + const hiddenInput = document.getElementById( + 'directory-structure-content' + ); - if (!hiddenInput) {return;} + if (!hiddenInput) { + return; + } textToCopy = hiddenInput.value; } else { - // For other elements, get the textarea value - const textarea = document.querySelector(`.${ className }`); + const textarea = document.querySelector(`.${className}`); - if (!textarea) {return;} + if (!textarea) { + return; + } textToCopy = textarea.value; } - const button = document.querySelector(`button[onclick="copyText('${className}')"]`); + const button = document.querySelector( + `button[onclick="copyText('${className}')"]` + ); - if (!button) {return;} + if (!button) { + return; + } - // Copy text - navigator.clipboard.writeText(textToCopy) + navigator.clipboard + .writeText(textToCopy) .then(() => { - // Store original content const originalContent = button.innerHTML; - // Change button content 
button.innerHTML = 'Copied!'; - // Reset after 1 second setTimeout(() => { button.innerHTML = originalContent; }, 1000); @@ -101,7 +103,6 @@ function copyText(className) { }); } -// Helper functions for toggling result blocks function showLoading() { document.getElementById('results-loading').style.display = 'block'; document.getElementById('results-section').style.display = 'none'; @@ -121,7 +122,6 @@ function showError(msg) { errorDiv.style.display = 'block'; } -// Helper function to collect form data function collectFormData(form) { const json_data = {}; const inputText = form.querySelector('[name="input_text"]'); @@ -130,28 +130,40 @@ function collectFormData(form) { const patternType = document.getElementById('pattern_type'); const pattern = document.getElementById('pattern'); - if (inputText) {json_data.input_text = inputText.value;} - if (token) {json_data.token = token.value;} - if (hiddenInput) {json_data.max_file_size = hiddenInput.value;} - if (patternType) {json_data.pattern_type = patternType.value;} - if (pattern) {json_data.pattern = pattern.value;} + if (inputText) { + json_data.input_text = inputText.value; + } + if (token) { + json_data.token = token.value; + } + if (hiddenInput) { + json_data.max_file_size = hiddenInput.value; + } + if (patternType) { + json_data.pattern_type = patternType.value; + } + if (pattern) { + json_data.pattern = pattern.value; + } return json_data; } -// Helper function to manage button loading state function setButtonLoadingState(submitButton, isLoading) { if (!isLoading) { submitButton.disabled = false; - submitButton.innerHTML = submitButton.getAttribute('data-original-content') || 'Submit'; + submitButton.innerHTML = + submitButton.getAttribute('data-original-content') || 'Submit'; submitButton.classList.remove('bg-[#ffb14d]'); return; } - // Store original content if not already stored if (!submitButton.getAttribute('data-original-content')) { - submitButton.setAttribute('data-original-content', 
submitButton.innerHTML); + submitButton.setAttribute( + 'data-original-content', + submitButton.innerHTML + ); } submitButton.disabled = true; @@ -167,44 +179,81 @@ function setButtonLoadingState(submitButton, isLoading) { submitButton.classList.add('bg-[#ffb14d]'); } -// Helper function to handle successful response function handleSuccessfulResponse(data) { - // Show results section showResults(); - // Store the digest_url for download functionality window.currentDigestUrl = data.digest_url; - // Set plain text content for summary, tree, and content document.getElementById('result-summary').value = data.summary || ''; - document.getElementById('directory-structure-content').value = data.tree || ''; + document.getElementById('directory-structure-content').value = + data.tree || ''; document.getElementById('result-content').value = data.content || ''; - // Populate directory structure lines as clickable
 elements
     const dirPre = document.getElementById('directory-structure-pre');
 
     if (dirPre && data.tree) {
         dirPre.innerHTML = '';
-        data.tree.split('\n').forEach((line) => {
-            const pre = document.createElement('pre');
-
-            pre.setAttribute('name', 'tree-line');
-            pre.className = 'cursor-pointer hover:line-through hover:text-gray-500';
-            pre.textContent = line;
-            pre.onclick = function () { toggleFile(this); };
-            dirPre.appendChild(pre);
+        const lines = data.tree.split('\n'); // one rendered row per line of the tree text
+        const tokenRegex = /^(.+?)(\s+\([\d.,]+[kM]?\s+tokens\))$/;
+
+        lines.forEach((line, index) => {
+            const container = document.createElement('div'); // row wrapper: name on the left, token count on the right
+            container.className =
+                'flex justify-between cursor-pointer hover:line-through hover:text-gray-500';
+            container.setAttribute('name', 'tree-line'); // same name attribute the removed <pre> rows carried
+
+            const match = line.match(tokenRegex); // null when the line has no trailing " (N tokens)" suffix
+            let namePart, tokenPart;
+
+            if (match) {
+                namePart = match[1]; // indentation, branch glyphs and entry name, without the suffix
+                tokenPart = match[2].trim(); // drop the whitespace that separated name and suffix
+            } else {
+                namePart = line;
+                tokenPart = '';
+            }
+
+            const namePre = document.createElement('pre'); // getFileName reads this row's text via querySelector('pre')
+            namePre.textContent = namePart; // textContent: the line is inserted as text, not parsed as HTML
+
+            container.appendChild(namePre);
+
+            if (tokenPart) {
+                const tokenSpan = document.createElement('span');
+                tokenSpan.textContent = tokenPart;
+                tokenSpan.style.color = '#6b7280';
+                tokenSpan.style.fontWeight = 'normal';
+                tokenSpan.style.whiteSpace = 'nowrap'; // keep the "(N tokens)" label on a single line
+
+                if (namePart.trim().endsWith('/')) { // NOTE(review): trailing "/" presumably marks a directory row - confirm against the tree format
+                    tokenSpan.style.fontWeight = 'bold';
+                }
+
+                container.appendChild(tokenSpan);
+            }
+
+            if (index >= 2) { // the first two rows get no click handler; this replaces the guard removed from toggleFile
+                container.onclick = function () {
+                    toggleFile(this); // `this` is the row container div, not the inner <pre>
+                };
+            }
+
+            dirPre.appendChild(container);
         });
     }
 
-    // Scroll to results
-    document.getElementById('results-section').scrollIntoView({ behavior: 'smooth', block: 'start' });
+    document
+        .getElementById('results-section')
+        .scrollIntoView({ behavior: 'smooth', block: 'start' }); // smooth-scroll the results section into view
 }
 
 function handleSubmit(event, showLoadingSpinner = false) {
     event.preventDefault();
     const form = event.target || document.getElementById('ingestForm');
 
-    if (!form) {return;}
+    if (!form) {
+        return;
+    }
 
     // Ensure hidden input is updated before collecting form data
     const slider = document.getElementById('file_size');
@@ -220,7 +269,9 @@ function handleSubmit(event, showLoadingSpinner = false) {
 
     const submitButton = form.querySelector('button[type="submit"]');
 
-    if (!submitButton) {return;}
+    if (!submitButton) {
+        return;
+    }
 
     const json_data = collectFormData(form);
 
@@ -232,7 +283,7 @@ function handleSubmit(event, showLoadingSpinner = false) {
     fetch('/api/ingest', {
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify(json_data)
+        body: JSON.stringify(json_data),
     })
         .then(async (response) => {
             let data;
@@ -247,21 +298,33 @@ function handleSubmit(event, showLoadingSpinner = false) {
             if (!response.ok) {
                 // Show all error details if present
                 if (Array.isArray(data.detail)) {
-                    const details = data.detail.map((d) => `
  • ${d.msg || JSON.stringify(d)}
  • `).join(''); + const details = data.detail + .map((d) => `
  • ${d.msg || JSON.stringify(d)}
  • `) + .join(''); - showError(`
    Error(s):
    `); + showError( + `
    Error(s):
    ` + ); return; } // Other errors - showError(`
    ${data.error || JSON.stringify(data) || 'An error occurred.'}
    `); + showError( + `
    ${ + data.error || + JSON.stringify(data) || + 'An error occurred.' + }
    ` + ); return; } // Handle error in data if (data.error) { - showError(`
    ${data.error}
    `); + showError( + `
    ${data.error}
    ` + ); return; } @@ -270,29 +333,35 @@ function handleSubmit(event, showLoadingSpinner = false) { }) .catch((error) => { setButtonLoadingState(submitButton, false); - showError(`
    ${error}
    `); + showError( + `
    ${error}
    ` + ); }); } function copyFullDigest() { - const directoryStructure = document.getElementById('directory-structure-content').value; + const directoryStructure = document.getElementById( + 'directory-structure-content' + ).value; const filesContent = document.querySelector('.result-text').value; const fullDigest = `${directoryStructure}\n\nFiles Content:\n\n${filesContent}`; const button = document.querySelector('[onclick="copyFullDigest()"]'); const originalText = button.innerHTML; - navigator.clipboard.writeText(fullDigest).then(() => { - button.innerHTML = ` + navigator.clipboard + .writeText(fullDigest) + .then(() => { + button.innerHTML = ` Copied! `; - setTimeout(() => { - button.innerHTML = originalText; - }, 2000); - }) + setTimeout(() => { + button.innerHTML = originalText; + }, 2000); + }) .catch((err) => { console.error('Failed to copy text: ', err); }); @@ -346,7 +415,7 @@ function logSliderToSize(position) { const maxPosition = 500; const maxValue = Math.log(102400); // 100 MB - const value = Math.exp(maxValue * (position / maxPosition)**1.5); + const value = Math.exp(maxValue * (position / maxPosition) ** 1.5); return Math.round(value); } @@ -357,13 +426,17 @@ function initializeSlider() { const sizeValue = document.getElementById('size_value'); const hiddenInput = document.getElementById('max_file_size_kb'); - if (!slider || !sizeValue || !hiddenInput) {return;} + if (!slider || !sizeValue || !hiddenInput) { + return; + } function updateSlider() { const value = logSliderToSize(slider.value); sizeValue.textContent = formatSize(value); - slider.style.backgroundSize = `${(slider.value / slider.max) * 100}% 100%`; + slider.style.backgroundSize = `${ + (slider.value / slider.max) * 100 + }% 100%`; hiddenInput.value = value; // Set hidden input to KB value } @@ -377,10 +450,10 @@ function initializeSlider() { // Add helper function for formatting size function formatSize(sizeInKB) { if (sizeInKB >= 1024) { - return `${ Math.round(sizeInKB / 1024) }MB`; 
+ return `${Math.round(sizeInKB / 1024)}MB`; } - return `${ Math.round(sizeInKB) }kB`; + return `${Math.round(sizeInKB)}kB`; } // Add this new function @@ -402,7 +475,6 @@ document.addEventListener('DOMContentLoaded', () => { setupGlobalEnterHandler(); }); - // Make sure these are available globally window.handleSubmit = handleSubmit; window.toggleFile = toggleFile;