#!/usr/bin/env python3
"""
Word frequency analysis across SillyTavern chat files.

Searches all swipes in the newest chat file per character folder.
Usage: python word_frequency.py (prompts for search term)
"""

import os
import sys
import glob
import json
from collections import defaultdict
from datetime import datetime

# SillyTavern has used several timestamp formats over time; try each in order.
_DATE_FORMATS = (
    '%B %d, %Y %I:%M%p',
    '%B %d, %Y %I:%M %p',
    '%Y-%m-%dT%H:%M:%S.%fZ',
)


def parse_date(date_str):
    """Parse a SillyTavern date string.

    Returns a datetime on success, or None when date_str is not a string
    or matches none of the known formats.
    """
    if not isinstance(date_str, str):
        return None
    cleaned = date_str.strip()
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
    return None


def analyze_word(search_word, base_dir=None):
    """Count case-insensitive occurrences of search_word across chat files.

    Scans the newest .jsonl chat file in each character subdirectory of
    base_dir (defaults to this script's directory). For each message, all
    non-empty swipes are searched when present, otherwise the 'mes' field.

    Returns (by_model, by_month, by_char, files_checked); the first three
    map source/month/character to {'count': ..., 'tokens': ...} dicts,
    where tokens is a rough len(text)//4 estimate.
    """
    if base_dir is None:
        base_dir = os.path.dirname(os.path.abspath(__file__))

    by_model = defaultdict(lambda: {'count': 0, 'tokens': 0})
    by_month = defaultdict(lambda: {'count': 0, 'tokens': 0})
    by_char = defaultdict(lambda: {'count': 0, 'tokens': 0})
    search_lower = search_word.lower()
    files_checked = 0

    for char_dir in os.listdir(base_dir):
        char_path = os.path.join(base_dir, char_dir)
        if not os.path.isdir(char_path):
            continue
        chat_files = glob.glob(os.path.join(char_path, '*.jsonl'))
        if not chat_files:
            continue
        # Only the most recently modified chat per character is analyzed.
        newest = max(chat_files, key=os.path.getmtime)
        files_checked += 1
        char_name = char_dir

        with open(newest, 'r', encoding='utf-8') as f:
            f.readline()  # first line is chat metadata, not a message
            for line in f:
                try:
                    msg = json.loads(line)
                    swipes = msg.get('swipes', [])
                    # Search every swipe when present; fall back to 'mes'.
                    if swipes:
                        text = ' '.join(s for s in swipes if s)
                    else:
                        text = msg.get('mes') or ''
                    tokens = len(text) // 4  # rough token estimate
                    is_user = msg.get('is_user', False)
                    extra = msg.get('extra') or {}
                    model = extra.get('model', '')
                    if is_user:
                        source = '[USER]'
                    elif model:
                        source = model
                    else:
                        source = '[AI-no-model]'
                    dt = parse_date(msg.get('send_date', ''))
                    month = dt.strftime('%Y-%m') if dt else 'unknown'
                    cnt = text.lower().count(search_lower)
                    by_model[source]['count'] += cnt
                    by_model[source]['tokens'] += tokens
                    by_month[month]['count'] += cnt
                    by_month[month]['tokens'] += tokens
                    if cnt > 0:
                        by_char[char_name]['count'] += cnt
                except (json.JSONDecodeError, AttributeError, TypeError):
                    # Best-effort: skip malformed lines / unexpected shapes.
                    continue

    return by_model, by_month, by_char, files_checked


def print_results(search_word, by_model, by_month, by_char, files_checked):
    """Print formatted results."""
    total_c = sum(d['count'] for d in by_model.values())
    total_t = sum(d['tokens'] for d in by_model.values())

    print(f'\nSearching for: "{search_word}"')
    print(f'Files checked: {files_checked} (newest per character)')
    print(f'Total tokens: {total_t/1e6:.2f}M')
    print()
    print('Model                                        Count   MTok  per 1M')
    print('=' * 72)

    # Models with matches (sorted by count)
    for src, d in sorted(by_model.items(), key=lambda x: -x[1]['count']):
        if d['count'] > 0:
            pm = d['count'] / d['tokens'] * 1e6 if d['tokens'] else 0
            print(f'{src:<44} {d["count"]:>5} {d["tokens"]/1e6:>6.2f} {pm:>6.1f}')

    print('-' * 72)

    # Models with 0 matches (sorted by tokens, top ones)
    zero_models = [(src, d) for src, d in by_model.items() if d['count'] == 0]
    zero_models.sort(key=lambda x: -x[1]['tokens'])
    for src, d in zero_models[:10]:
        if d['tokens'] > 50000:  # Only show significant ones
            print(f'{src:<44}     0 {d["tokens"]/1e6:>6.2f}    0.0')

    print('=' * 72)
    # Guard against empty input (no tokens at all) — matches per-row guards.
    total_pm = total_c / total_t * 1e6 if total_t else 0
    print(f'{"TOTAL":<44} {total_c:>5} {total_t/1e6:>6.2f} {total_pm:>6.1f}')

    # By month (non-zero only)
    months_with_hits = [(m, d) for m, d in by_month.items()
                        if d['count'] > 0 and m != 'unknown']
    if months_with_hits:
        print('\nBy Month (non-zero):')
        print('-' * 40)
        for m, d in sorted(months_with_hits):
            pm = d['count'] / d['tokens'] * 1e6 if d['tokens'] else 0
            bar = '*' * min(d['count'], 40)
            print(f'{m}: {d["count"]:>3} ({pm:>5.1f}/M) {bar}')

    # By character
    if by_char:
        print('\nBy Character:')
        print('-' * 40)
        for c, d in sorted(by_char.items(), key=lambda x: -x[1]['count']):
            print(f'  {c}: {d["count"]}')


def main():
    """Prompt for a search term, analyze, and print the report."""
    search_word = input('Enter word/phrase to search: ').strip()
    if not search_word:
        print('No search term entered.')
        sys.exit(1)
    by_model, by_month, by_char, files_checked = analyze_word(search_word)
    print_results(search_word, by_model, by_month, by_char, files_checked)


if __name__ == '__main__':
    main()