In [1]:
#@markdown # Step 1: Load the data

ppp_path = "/gdrive/MyDrive/pony-preservation-project/" #@param {type:"string"}
#@markdown If this crashes, just try running it again. \
#@markdown If you haven't added the pony-preservation-project folder to your drive, you need to do that first.
#@markdown - Go here: https://drive.google.com/drive/u/2/folders/1MuM9Nb_LwnVxInIPFNvzD_hv3zOZhpwx
#@markdown - Click the "pony-preservation-project" part of the path
#@markdown - Select "Add shortcut to Drive"
#@markdown - Update the ppp_path in the text field above to the shortcut location

#@markdown ![Add shortcut to drive](https://u.smutty.horse/mewskbspdxa.png)

import os
if not os.path.exists('/gdrive'):
    print('Mounting google drive')
    from google.colab import drive
    drive.mount('/gdrive')

print('Installing dependencies')
!rm -r datasets
!git clone https://github.com/synthbot-anon/datasets.git
!pip install -r datasets/requirements.txt
from datasets.fimfarchive import Fimfarchive, TemplatedStoryString, read_chapter

import glob
fimfarchive_path_candidates = glob.glob(f'{ppp_path}/story-data/fimfarchive - *')
assert fimfarchive_path_candidates, "You need to add the pony-preservation-project folder to your google drive."

fimfarchive_path_candidates.sort(key=os.path.getmtime)
fimfarchive_path = os.path.normpath(fimfarchive_path_candidates[-1])
print('fimfarchive_path:', fimfarchive_path)
print('loading the fimfarchive')
fimfarchive = Fimfarchive(fimfarchive_path)
print('loading the txt cache')
read_chapter(fimfarchive.chapter_texts, '9', 0)

# Default text-dump template. It is a raw string, so the "\n" sequences below
# reach the template file as a literal backslash followed by "n".
baseline_dump_template = r"""<|info|>
title: {.title}
author: {.author.name}
tags: {join .tags.type ":" .tags.name with ", "}

<|startoftext|>
{join "=== " .chapters.title " ===\n" chapter_text with "\n" * 4}
<|endoftext|>"""

dump_template_file = "/tmp/fimfarchive-dump-template.txt"

# Seed the file only when it is missing, so a template that was edited by hand
# survives re-running this cell.
if not os.path.exists(dump_template_file):
    print('writing baseline template files to', dump_template_file)
    # The download cell reads this file back with encoding='utf8'; write it
    # with the same encoding instead of relying on the platform default.
    with open(dump_template_file, "w", encoding="utf8") as template:
        template.write(baseline_dump_template)

# User-facing explanation of the template syntax. It is a raw string, so the
# "\n" examples inside it are written to the help file verbatim.
dump_template_help_contents = r"""Any text outside of curly braces will be treated as normal text.

Inside curly braces, you can add story data and text.

    Title: {.title}
    Author: {.author.name}
    Link: {.url}

Any field you see in the fimfarchive index.json file is accessible inside the
curly braces. Just begin the field with a dot . and enter the full path to the
field you want.

    {.archive.date_updated}

Inside {}, you can interleave story data and regular text.

    { "=== " .title " ===" }



As a shortcut, you can also use * to repeat some text.

    { "\n"*2 .title "\n" .author.name "\n"*2}

There are two special fields: .tags and .chapters. They're special because They
can contain multiple items in them. If you want to use these inside curly
braces, you need to specify how to combine all of the items. You can do this
with a {join}.

This will put together all of the story tag names separated with a comma.

    { join .tags.name with ", "}

And this will create a list, one per line, of each tag type and tag name.

    {join .tags.type ": " .tags.name with "\n"}

The resulting list would look something like this:

    character: Rainbow Dash
    character: Twilight Sparkle
    genre: Adventure
    genre: Romance
    series: My Little Pony: Friendship is Magic

Lastly, there's a special symbol chapter_text for accessing the story text,
chapter by chapter. You can use it like any other field inside a {join}.

    { join .chapters.title "\n" chapter_text with "\n"*4 }

That's all. Try copy/pasting this template into the actual template file
and running the cell to see what it looks like. Make sure to select the "sample"
checkbox so it doesn't take too long.
"""

dump_template_help = "/tmp/fimfarchive-dump-template-help.txt"

# Write the help file once; an existing copy is left untouched on re-runs.
if not os.path.exists(dump_template_help):
    # Explicit encoding so the result does not depend on the platform default.
    with open(dump_template_help, "w", encoding="utf8") as helpfile:
        helpfile.write(dump_template_help_contents)

print('Done')

Mounting google drive
Mounted at /gdrive
Installing dependencies
rm: cannot remove 'datasets': No such file or directory
Cloning into 'datasets'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 35 (delta 16), reused 29 (delta 10), pack-reused 0[K
Unpacking objects: 100% (35/35), 11.81 KiB | 806.00 KiB/s, done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lark
  Downloading lark-1.1.5-py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 KB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting EbookLib
  Downloading EbookLib-0.18.tar.gz (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.5/115.5 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages:

In [None]:
import json

# Leftovers from earlier date-query experiments; nothing else in this cell reads them.
date = "date_published"
dash = "-"
#@markdown # Optional: Search for stories
tag_query = "trixie, -equestria girls, -character:main 7, -eqg, -a new generation, -crossover, -gore, -random, -human, -anthro, -character:young six, -gallus, -yona, -sky beak, -terramar, -smolder, -silverstream, -sandbar, -ocean flow, -ocellus" #@param {type:"string"}

numerical_query = "{\"lower\": \"2011-01-01\", \"upper\": \"2011-12-31\"}" #@param {type:"string"}

# The form field holds a JSON object with "lower"/"upper" dates. Strip the
# dashes so each bound becomes a YYYYMMDD string that is easy to compare.
date_window = json.loads(numerical_query)
lower_date = date_window["lower"].replace("-", "")
upper_date = date_window["upper"].replace("-", "")
numerical_query = dict(lower=lower_date, upper=upper_date)

# Disabled snippet kept as a plain string; it is never executed.
tmpremove="""
if tag_query and numerical_query:
  search_terms = ', '.join([tag_query.strip(), numerical_query.strip()])
else:
  #search_terms = tag_query or numerical_query
  search_terms = numerical_query
"""

# Only the tag query is sent to the archive search; the date window is applied
# separately to the stories it returns.
search_terms = tag_query.strip()

max_results = 5 #@param {type:"integer"}

random_sample = True #@param {type:"boolean"}
add_fic_date_NONE = True #@param {type:"boolean"}

#@markdown ### Search options:
#@markdown - **Require multiple tags:** `twilight sparkle, celestia`. This is intuitively the same as a boolean "AND", and it translates logically to a set intersection. The returned results will match both requirements left and right of the comma.
#@markdown - **Reject a tag**: `-celestia`. This is intuitively the same as a boolean "NOT" and it translates logically to a set complement. The returned results will not match the negated requirement.
#@markdown - **Allow either tag:** `twilight | celestia`. This is intuitively the same as a boolean "OR", and it translates logically to a set union. The returned results will match either of the two requirements left or right of the pipe.
#@markdown - **Group query parts:** `twilight, -(celestia | luna)`.
#@markdown - **Restrict to a category:** `character: non` (allowed categories: character, genre, series, content, warning). This limits the scope of a tag-based requirement to just one type of tag. If you search just `non` by itself, any story with tags `winona`, `anon`, or `non-consensual` will match. If you search `character: non`, only `winona` and `anon` will match.
#@markdown - **Restrict by likes/dislikes/wordcount:** `.ratio * .likes > .wordcount`. Any of these restrictions must be based on some sort of comparison, which includes `>`, `<`, `>=`, `<=`, and `=`. On the left and right sides of the comparison, you can use the operations `+, -, *, /, ^, min(...), max(...)` and the special symbols `.likes, .dislikes, .wordcount, .ratio`. The arithmetic operations will follow their standard order of operations. You can group operations using parentheses. See the Notes below for details on how to access more special symbols.
#@markdown - **Use special characters:** `"p0n-3"` with the surrounding double-quotes. If you use anything other than letters and numbers (e.g., a dash), you'll need to surround it with double-quotes.
#@markdown - The `numerical_query` needs to be written in this format _{"lower": "2011-01-01", "upper": "2011-12-31"}_ to work, with the first date (YYYY-MM-DD) being earlier than the second. If you don't care about a specific date range, just put very early and very late dates like 1900-01-01 and 3333-12-31.
#@markdown - `add_fic_date_NONE`: due to the brokenness of fimfic, some stories fit the `tag_query` BUT do not have any publication date attached to them, so you have the choice of adding those stories to your collection. E.g., when looking for stories from 2011~2012 it may grab a few random ones from 2017+, BUT the more details/info you put in the `tag_query`, the less likely the code will grab random crap.

#@markdown ### Notes:

#@markdown - Use a `max_results` value of 0 to get all results.
#@markdown - Be careful when searching for characters. Sometimes FimFiction uses collective tags, like `Main 6`. In these cases, a story tagged with `Main 6` may contain `Twilight Sparkle` even though it does not have a `Twilight Sparkle` tag. This search does not automatically expand tags like `Main 6` into all of their implied sub-tags.
#@markdown - If there is no like/dislike data, both are assumed to be -1.
#@markdown - `.ratio = .likes / .dislikes` where both values clip to 0.5. So if there are 10 likes and 0 dislikes, the ratio is treated as 20.
#@markdown - The two query fields work together: `tag_query` uses the search options above, `numerical_query` is the JSON date range, and only stories that pass both filters will get displayed.
#@markdown - In addition to `.likes`, `.dislikes`, `.wordcount`, and `.ratio`, you can use most fields in the fimfarchive's index.json. For example, `.author.name`, `.num_views`, and so on. The symbol name here is based on the JSON path in index.json. You can't reference per-chapter information or per-tag information using this "dot" notation. You also can't access information that the fimfarchive tracks inconsistently, like `.author.num_followers`, though this might get fixed if I decide to clean up the fimfarchive data.


import random

def gen_results(search_terms):
  """Lazily yield the ID of every story that matches ``search_terms``.

  The query string is handed unchanged to ``fimfarchive.query_stories`` and its
  results are passed through in the order they are produced.
  """
  yield from fimfarchive.query_stories(search_terms)

# Query the archive and filter the matches by publication date.
def gen_results_date(search_terms, numerical_query=None, add_fic_date_NONE=False):
    """Yield IDs of stories matching ``search_terms`` that pass the date filter.

    Args:
        search_terms: Query string passed unchanged to
            ``fimfarchive.query_stories``.
        numerical_query: Optional mapping with ``"lower"`` and ``"upper"`` day
            bounds (both inclusive), written as ``YYYYMMDD`` or ``YYYY-MM-DD``.
            A missing, ``None`` or empty bound leaves that side open. ``None``
            for the whole mapping disables date filtering.
        add_fic_date_NONE: Whether to keep stories whose ``date_published`` is
            ``None``.

    Yields:
        Story IDs in the order ``fimfarchive.query_stories`` produces them.
    """
    def day_number(stamp):
        # Keep the part before "T" and drop the dashes:
        # "2011-07-01T12:00:00" -> 20110701.
        return int(str(stamp).split("T")[0].replace("-", ""))

    def parse_bound(key):
        if numerical_query is None:
            return None
        raw = numerical_query.get(key)
        return None if raw in (None, "") else day_number(raw)

    lower_bound = parse_bound("lower")
    upper_bound = parse_bound("upper")
    unbounded = lower_bound is None and upper_bound is None

    for story_id in fimfarchive.query_stories(search_terms):
        date_published = fimfarchive.stories_by_id[story_id]["date_published"]

        if date_published is None:
            # An undated story cannot be compared against the window, so the
            # flag alone decides. This check used to run only when no window
            # was given, which let undated stories through whenever a window
            # was supplied, regardless of the flag.
            if add_fic_date_NONE:
                yield story_id
            continue

        if unbounded:
            # No bound to compare against: do not parse the date at all.
            yield story_id
            continue

        published_day = day_number(date_published)
        if lower_bound is not None and published_day < lower_bound:
            continue
        if upper_bound is not None and published_day > upper_bound:
            continue
        yield story_id

def print_result(story_id):
  """Print a short summary of one story from ``fimfarchive.stories_by_id``.

  The summary lists title and completion status, author, tag names, word and
  like/dislike counts, link, publication date and the story ID, followed by a
  blank separator line.
  """
  story = fimfarchive.stories_by_id[story_id]
  headline = story['title']
  tag_names = ', '.join(tag['name'] for tag in story['tags'])
  author_name = story['author']['name']
  link = story['url']
  print(f'{headline} ({story["completion_status"]})')
  print('author:', author_name)
  print('tags:', tag_names)
  print(story["num_words"], 'words,', story["num_likes"], 'likes,', story["num_dislikes"], 'dislikes')
  print('link:', link)
  print('date:', story["date_published"])
  print(f'story ID: {story_id}')
  print()

# Run the tag query once; gen_results_date applies the date window and the
# undated-story option while streaming the matches.
results_time = list(gen_results_date(search_terms, numerical_query, add_fic_date_NONE))

results = results_time

# Shuffle before truncating so the displayed subset is a random sample of all
# matches rather than always the first few.
if random_sample:
  random.shuffle(results)

# max_results == 0 means "show everything".
if max_results > 0:
  results = results[:max_results]

# A second shuffle guarded by `numerical_date_search` used to sit here. That
# name is never defined, so the cell raised NameError whenever random_sample
# was enabled; the list is already shuffled above, so the block was dropped.

# `story_id` instead of `id`, so the builtin is not shadowed for later cells.
for story_id in results:
  print_result(story_id)
  




In [4]:
#@markdown # Optional: Search for tags

search_terms = "date" #@param {type:"string"}

#@markdown ### Notes:
#@markdown The search options are the same as for stories. The only difference is that this searches the set of tags on fimfiction, not stories with certain tags.

# Resolve the query once, then walk the tag index category by category and
# print every tag whose ID is among the matches as "category:name".
results = fimfarchive.query_tags(search_terms)
for category, tags in fimfarchive.tags_by_type.items():
  matching_names = [tag_name for tag_id, tag_name in tags.items() if tag_id in results]
  for tag_name in matching_names:
    print(f'{category}:{tag_name}')





In [149]:
#@markdown # Download stories
# json is needed a few lines below. Import it here so this cell also works on
# a fresh runtime where the optional story-search cell (the only other place
# that imports json before this point) has not been run.
import json

# Not read by any code visible in this notebook; kept as-is.
date="date_published"
dash=("-")
tag_query = "Trixie, -equestria girls, -character:main 7, -eqg, -a new generation, -crossover, -gore, -random, -human, -anthro, -character:young six, -gallus, -yona, -sky beak, -terramar, -smolder, -silverstream, -sandbar, -ocean flow, -ocellus,  -character:sunset shimmer, -character:zecora" #@param {type:"string"}
# Only the tag query goes to the archive search; the date window is applied to
# the stories it returns.
search_terms = tag_query.strip()

numerical_query = "{\"lower\": \"2011-01-01\", \"upper\": \"2011-12-31\"}" #@param {type:"string"}

# The form field holds a JSON object; strip the dashes so each bound becomes a
# YYYYMMDD string.
numerical_query = json.loads(numerical_query)
lower_date = numerical_query["lower"].replace("-", "")
upper_date = numerical_query["upper"].replace("-", "")
numerical_query = {"lower": lower_date, "upper": upper_date}



# Output flavour, compared against 'zip' further down in this cell; any other
# value takes the single-text-file branch.
# NOTE(review): this name shadows the builtin `format` for the rest of the session.
format = "text dump" #@param ["text dump", "zip"]
#@markdown - **customize text dump:** /tmp/fimfarchive-dump-template.txt
# When True, only the first 3 matching stories are processed.
sample = False #@param {type:"boolean"}


# Passed to gen_results_date as its add_fic_date_NONE argument.
add_fic_date_NONE = True #@param {type:"boolean"}

#@markdown **Text options**
# Passed to TemplatedStoryString as its second argument.
consistent_quotes = True #@param {type:"boolean"}

#@markdown ### Notes:
#@markdown - You can use the story search cell above to see which stories will be downloaded. The search options work the same way.
#@markdown - The "sample" option means download the first 3 results. It's there so you can sanity check the results without being hit with a multi-gigabyte download.
#@markdown - "consistent_quotes" will replace fancy unicode quotes with boring ascii ones.
#@markdown - See /tmp/fimfarchive-dump-template-help.txt for an explanation of how to format the template file.


###
import os
import random
import re
from tqdm import tqdm
import itertools
from joblib import Parallel, delayed
from google.colab import files
import zipfile
import tarfile
import json

from datasets.fimfarchive import TemplatedStoryString
# Renderer used by the dump loop in this cell, which calls
# `templated_dump.parse(template, story_id)` once per story. It is built from
# the archive loaded in Step 1 and the `consistent_quotes` form option.
templated_dump = TemplatedStoryString(fimfarchive, consistent_quotes)


def gen_results_date(search_terms, numerical_query=None, add_fic_date_NONE=False):
    """Yield IDs of stories matching ``search_terms`` that pass the date filter.

    Defined in this cell so the download can run without the optional
    story-search cell.

    Args:
        search_terms: Query string passed unchanged to
            ``fimfarchive.query_stories``.
        numerical_query: Optional mapping with ``"lower"`` and ``"upper"`` day
            bounds (both inclusive), written as ``YYYYMMDD`` or ``YYYY-MM-DD``.
            A missing, ``None`` or empty bound leaves that side open. ``None``
            for the whole mapping disables date filtering.
        add_fic_date_NONE: Whether to keep stories whose ``date_published`` is
            ``None``.

    Yields:
        Story IDs in the order ``fimfarchive.query_stories`` produces them.
    """
    def to_day(stamp):
        # "2011-07-01T12:00:00" / "2011-07-01" / "20110701" -> 20110701
        return int(str(stamp).split("T")[0].replace("-", ""))

    lower_bound = upper_bound = None
    if numerical_query is not None:
        raw_lower = numerical_query.get("lower")
        raw_upper = numerical_query.get("upper")
        if raw_lower not in (None, ""):
            lower_bound = to_day(raw_lower)
        if raw_upper not in (None, ""):
            upper_bound = to_day(raw_upper)

    for story_id in fimfarchive.query_stories(search_terms):
        date_published = fimfarchive.stories_by_id[story_id]["date_published"]

        if date_published is None:
            # The flag must decide the fate of undated stories even when a
            # window is given. Previously it was only looked at when
            # numerical_query was None, so with a window every undated story
            # slipped through.
            if add_fic_date_NONE:
                yield story_id
            continue

        if lower_bound is None and upper_bound is None:
            # Nothing to compare against: skip date parsing entirely.
            yield story_id
            continue

        published_day = to_day(date_published)
        if lower_bound is not None and published_day < lower_bound:
            continue
        if upper_bound is not None and published_day > upper_bound:
            continue
        yield story_id


# Collect the matching story IDs into a list up front: they are iterated twice
# below (once to render each story, once to bundle the rendered files).
results = list(gen_results_date(search_terms, numerical_query, add_fic_date_NONE))
if sample:
  # Slice instead of itertools.islice: islice is a one-shot iterator, so the
  # render loop used it up and the zip/write loop then saw no stories at all,
  # producing an empty download in sample mode.
  results = results[:3]

with open('/tmp/fimfarchive-dump-template.txt', encoding='utf8') as template_file:
  template = template_file.read()

# Pass 1: render every story into its own file.
os.makedirs('/tmp/fimfarchive-dump/', exist_ok=True)
for story_id in tqdm(results, desc='dumping stories', unit='fic', position=0, leave=True):
  output_path = f'/tmp/fimfarchive-dump/{story_id}.txt'
  try:
    # Explicit utf8 so story text does not depend on the platform default.
    with open(output_path, 'w', encoding='utf8') as output:
      result = templated_dump.parse(template, story_id)
      output.write(result)
  except Exception as error:
    # Best effort: report the story and carry on with the rest.
    # KeyboardInterrupt is not an Exception subclass, so it still stops the run.
    print('failed to dump story', story_id, f'({error!r})')
    # Remove the partial file so pass 2 skips this story. The file may not
    # exist if open() itself failed.
    if os.path.exists(output_path):
      os.remove(output_path)

# Pass 2: bundle the rendered files and hand the result to the browser.
if format == 'zip':
  with zipfile.ZipFile("/tmp/fimfarchive-dump.zip", "w") as archive:
    for story_id in tqdm(results, desc='zipping', unit='fic', position=0, leave=True):
      path = f'/tmp/fimfarchive-dump/{story_id}.txt'
      if not os.path.exists(path):
        continue  # rendering failed for this story
      archive.write(path, f'{story_id}.txt')
  files.download('/tmp/fimfarchive-dump.zip')
else:
  with open('/tmp/fimfarchive-dump.txt', 'w', encoding='utf8') as output:
    for story_id in tqdm(results, desc='writing', unit='fic', position=0, leave=True):
      input_path = f'/tmp/fimfarchive-dump/{story_id}.txt'
      if not os.path.exists(input_path):
        continue  # rendering failed for this story
      with open(input_path, encoding='utf8') as story_file:
        output.write(story_file.read())
  files.download('/tmp/fimfarchive-dump.txt')

print('Done')


dumping stories: 100%|██████████| 91/91 [00:57<00:00,  1.59fic/s]
writing: 100%|██████████| 91/91 [00:00<00:00, 1854.96fic/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Done
