#!/usr/bin/env python3
"""
Word frequency analysis across SillyTavern chat files.

Searches all swipes in the newest chat file per character folder.
Usage: python word_frequency.py (prompts for search term)
"""

import os
import sys
import glob
import json
from collections import defaultdict
from datetime import datetime

# SillyTavern has used several timestamp formats over time; try each in order.
_DATE_FORMATS = (
    '%B %d, %Y %I:%M%p',
    '%B %d, %Y %I:%M %p',
    '%Y-%m-%dT%H:%M:%S.%fZ',
)


def parse_date(date_str):
    """Parse a SillyTavern date string.

    Returns a datetime on success, or None when date_str is not a string
    or matches none of the known formats.
    """
    if not isinstance(date_str, str):
        return None
    cleaned = date_str.strip()
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
    return None


def analyze_word(search_word, base_dir=None):
    """Count case-insensitive occurrences of search_word across chat files.

    Scans the newest .jsonl chat file in each character subdirectory of
    base_dir (defaults to this script's directory). For each message, all
    non-empty swipes are searched when present, otherwise the 'mes' field.

    Returns (by_model, by_month, by_char, files_checked); the first three
    map source/month/character to {'count': ..., 'tokens': ...} dicts,
    where tokens is a rough len(text)//4 estimate.
    """
    if base_dir is None:
        base_dir = os.path.dirname(os.path.abspath(__file__))

    by_model = defaultdict(lambda: {'count': 0, 'tokens': 0})
    by_month = defaultdict(lambda: {'count': 0, 'tokens': 0})
    by_char = defaultdict(lambda: {'count': 0, 'tokens': 0})
    search_lower = search_word.lower()
    files_checked = 0

    for char_dir in os.listdir(base_dir):
        char_path = os.path.join(base_dir, char_dir)
        if not os.path.isdir(char_path):
            continue
        chat_files = glob.glob(os.path.join(char_path, '*.jsonl'))
        if not chat_files:
            continue
        # Only the most recently modified chat per character is analyzed.
        newest = max(chat_files, key=os.path.getmtime)
        files_checked += 1
        char_name = char_dir

        with open(newest, 'r', encoding='utf-8') as f:
            f.readline()  # first line is chat metadata, not a message
            for line in f:
                try:
                    msg = json.loads(line)
                    swipes = msg.get('swipes', [])
                    # Search every swipe when present; fall back to 'mes'.
                    if swipes:
                        text = ' '.join(s for s in swipes if s)
                    else:
                        text = msg.get('mes') or ''
                    tokens = len(text) // 4  # rough token estimate
                    is_user = msg.get('is_user', False)
                    extra = msg.get('extra') or {}
                    model = extra.get('model', '')
                    if is_user:
                        source = '[USER]'
                    elif model:
                        source = model
                    else:
                        source = '[AI-no-model]'
                    dt = parse_date(msg.get('send_date', ''))
                    month = dt.strftime('%Y-%m') if dt else 'unknown'
                    cnt = text.lower().count(search_lower)
                    by_model[source]['count'] += cnt
                    by_model[source]['tokens'] += tokens
                    by_month[month]['count'] += cnt
                    by_month[month]['tokens'] += tokens
                    if cnt > 0:
                        by_char[char_name]['count'] += cnt
                except (json.JSONDecodeError, AttributeError, TypeError):
                    # Best-effort: skip malformed lines / unexpected shapes.
                    continue

    return by_model, by_month, by_char, files_checked


def print_results(search_word, by_model, by_month, by_char, files_checked):
    """Print formatted results."""
    total_c = sum(d['count'] for d in by_model.values())
    total_t = sum(d['tokens'] for d in by_model.values())

    print(f'\nSearching for: "{search_word}"')
    print(f'Files checked: {files_checked} (newest per character)')
    print(f'Total tokens: {total_t/1e6:.2f}M')
    print()
    print('Model                                        Count   MTok  per 1M')
    print('=' * 72)

    # Models with matches (sorted by count)
    for src, d in sorted(by_model.items(), key=lambda x: -x[1]['count']):
        if d['count'] > 0:
            pm = d['count'] / d['tokens'] * 1e6 if d['tokens'] else 0
            print(f'{src:<44} {d["count"]:>5} {d["tokens"]/1e6:>6.2f} {pm:>6.1f}')

    print('-' * 72)

    # Models with 0 matches (sorted by tokens, top ones)
    zero_models = [(src, d) for src, d in by_model.items() if d['count'] == 0]
    zero_models.sort(key=lambda x: -x[1]['tokens'])
    for src, d in zero_models[:10]:
        if d['tokens'] > 50000:  # Only show significant ones
            print(f'{src:<44}     0 {d["tokens"]/1e6:>6.2f}    0.0')

    print('=' * 72)
    # Guard against empty input (no tokens at all) — matches per-row guards.
    total_pm = total_c / total_t * 1e6 if total_t else 0
    print(f'{"TOTAL":<44} {total_c:>5} {total_t/1e6:>6.2f} {total_pm:>6.1f}')

    # By month (non-zero only)
    months_with_hits = [(m, d) for m, d in by_month.items()
                        if d['count'] > 0 and m != 'unknown']
    if months_with_hits:
        print('\nBy Month (non-zero):')
        print('-' * 40)
        for m, d in sorted(months_with_hits):
            pm = d['count'] / d['tokens'] * 1e6 if d['tokens'] else 0
            bar = '*' * min(d['count'], 40)
            print(f'{m}: {d["count"]:>3} ({pm:>5.1f}/M) {bar}')

    # By character
    if by_char:
        print('\nBy Character:')
        print('-' * 40)
        for c, d in sorted(by_char.items(), key=lambda x: -x[1]['count']):
            print(f'  {c}: {d["count"]}')


def main():
    """Prompt for a search term, analyze, and print the report."""
    search_word = input('Enter word/phrase to search: ').strip()
    if not search_word:
        print('No search term entered.')
        sys.exit(1)
    by_model, by_month, by_char, files_checked = analyze_word(search_word)
    print_results(search_word, by_model, by_month, by_char, files_checked)


if __name__ == '__main__':
    main()