# Use 4chan's API to build a large filter of images containing Amelia Watson,
# by scraping the Amelia Watson split thread for image MD5 hashes.
#
# Best run at a regular interval through cron jobs on Linux or Task Scheduler
# on Windows.

import json
import os
import re
import sys
import time

import requests

_CATALOG_URL = "https://a.4cdn.org/vt/catalog.json"
_THREAD_URL_TEMPLATE = "https://a.4cdn.org/vt/thread/{thread_no}.json"
_FILTER_PATH = "filter.txt"

# Without a timeout an unattended (cron) run could block forever on a stalled
# connection.
_REQUEST_TIMEOUT_SECONDS = 30

# Compiled once; matched against the lower-cased opening comment of a thread.
_THREAD_PATTERN = re.compile(r"amelia\swatson\sappreciation")

max_attempts = 5
delay_between_attempts = 1  # minutes


def _fetch_json(url, failure_message):
    """Return the decoded JSON body of a GET request to *url*.

    On a connection error, a timeout, a non-200 status or an undecodable
    body, print *failure_message* followed by the reason and terminate the
    process with exit status -1.
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        if response.status_code == 200:
            return response.json()
        reason = f"HTTP status {response.status_code}"
    except (requests.RequestException, ValueError) as err:
        reason = str(err)
    print(failure_message)
    print(f"Reason: {reason}")
    sys.exit(-1)


def _find_thread_no(catalog):
    """Return the number of the matching thread in *catalog*, or None.

    *catalog* is the decoded catalog JSON: a list of pages, each holding a
    "threads" list.  A thread matches when its lower-cased "com" field
    starts with the target pattern.  When several threads match, the last
    one encountered wins.
    """
    target_thread_no = None
    for page in catalog:
        for thread in page["threads"]:
            comment = thread.get("com")
            if comment is None:
                # A thread without a comment must not stop the scan of the
                # remaining threads on this page.
                continue
            # match() anchors at the start of the comment on purpose: only
            # threads that open with the phrase are considered.
            if _THREAD_PATTERN.match(comment.lower()):
                target_thread_no = thread["no"]
    return target_thread_no


def get_catalog_and_search_for_thread():
    """Download the /vt/ catalog and look for the target thread.

    Returns the thread number, or None when no thread matches.  Terminates
    the process with exit status -1 when the catalog cannot be retrieved.
    """
    catalog = _fetch_json(
        _CATALOG_URL,
        "Error: could not retrieve the catalog from 4chan's API endpoint!",
    )
    print("Retrieved the /vt/ catalog.")
    return _find_thread_no(catalog)


def _append_new_filter_lines(path, filter_entries):
    """Append the entries not yet present in the file at *path*.

    Each entry is written on its own line.  Entries that already exist as a
    line of the file, or that occur more than once in *filter_entries*, are
    written only once.  The file is created when it does not exist, and its
    existing content is left untouched.

    Returns the number of entries that were appended.
    """
    try:
        with open(path) as filterfile:
            existing_text = filterfile.read()
    except FileNotFoundError:
        existing_text = ""

    seen = set(existing_text.splitlines())
    new_entries = []
    for entry in filter_entries:
        if entry not in seen:
            seen.add(entry)
            new_entries.append(entry)

    with open(path, mode="a") as filterfile:
        # Keep the first new entry from being glued onto an unterminated
        # last line.
        if new_entries and existing_text and not existing_text.endswith("\n"):
            filterfile.write("\n")
        filterfile.writelines(f"{entry}\n" for entry in new_entries)
    return len(new_entries)


def main():
    """Locate the thread, collect its image MD5s and merge them into the filter."""
    target_thread_no = None
    for attempt in range(1, max_attempts + 1):
        target_thread_no = get_catalog_and_search_for_thread()
        if target_thread_no:
            print(
                f"Hit thread no. {target_thread_no}, next request will retrieve posts..."
            )
            break
        # No point in waiting after the final attempt.
        if attempt < max_attempts:
            print(
                f"Couldn't find matching thread, trying again in {delay_between_attempts} minutes..."
            )
            time.sleep(delay_between_attempts * 60)

    if not target_thread_no:
        print("Couldn't find the thread to scrape for MD5 hashes, exiting.")
        sys.exit(-1)

    # 4chan API does not allow more than 1 request per second
    time.sleep(1.0)

    thread = _fetch_json(
        _THREAD_URL_TEMPLATE.format(thread_no=target_thread_no),
        "Error: could not retrieve the thread from 4chan's API endpoint!",
    )
    # Only posts that carry an "md5" field contribute; each hash is wrapped
    # in slashes to form one filter line.
    md5s = [f'/{post["md5"]}/' for post in thread["posts"] if "md5" in post]

    print(f"Hit {len(md5s)} suspected Amelia Watson images.")
    print("Opening filter.txt to write them...")
    new_count = _append_new_filter_lines(_FILTER_PATH, md5s)
    print(f"Wrote {new_count} new suspected Amelia Watson images to filter.")


if __name__ == "__main__":
    main()