In [None]:
#CELL 1
# Renders an HTML <audio> player for a silent clip (silence.m4a). Per the form
# title below, leaving it playing is meant to keep Colab from disconnecting.
#@title Keep this widget playing to prevent Colab from disconnecting you { display-mode: "form" }
#@markdown Press play on the audio player that will appear below:
%%html
<audio src="https://oobabooga.github.io/silence.m4a" controls>

In [None]:
import requests
import tarfile
import os
import time
import re
import threading
from google.oauth2.service_account import Credentials
import hashlib
import gspread

#@title # **Koboldcpp 1.44 Colab (Improved Edition)**

#@markdown ---
#@markdown # Download Options

# URL of the built koboldcpp folder (a .tar.gz archive).
# Only downloaded when Force_Update_Build is False and no install exists yet.
url = "https://huggingface.co/kalomaze/ColabDependencies/resolve/main/koboldcpp.tar.gz"

# Model is a key of `model_links`; Quant_Method is substituted into that entry's
# ".Q{}.gguf" URL template to pick the quantized file.
Model = "MythoMax-L2-13B-GGUF" #@param ["MythoMax-L2-13B-GGUF", "ReMM-v2-L2-13B-GGUF", "ReMM-SLERP-L2-13B-GGUF", "Stheno-L2-13B-GGUF","MLewdBoros-L2-13B-GGUF"]
Quant_Method = "4_K_M" #@param ["3_K_L", "4_K_S", "4_K_M", "5_K_S", "5_K_M"]

#@markdown #### OPTIONAL: Manual Model Link
# When enabled and a non-blank link is given, Manual_Link is used instead of the
# preset above; any '/blob/' in the link is rewritten to '/resolve/'.
Use_Manual_Model = True #@param {type:"boolean"}
Manual_Link = "https://huggingface.co/Artefact2/Nous-Hermes-2-Mixtruct-v0.1-8x7B-DPO-DARE_TIES-GGUF/resolve/main/Nous-Hermes-2-Mixtruct-v0.1-8x7B-DPO-DARE_TIES-IQ2_XS.gguf?download=true" #@param {type:"string"}

#@markdown #### OPTIONAL: Use LoRA
# Lora_Link receives the same '/blob/' -> '/resolve/' rewrite as Manual_Link.
Use_Lora = False #@param {type:"boolean"}
Lora_Link = "" #@param {type:"string"}

#@markdown ---

#@markdown # Launch Options

# Both values are strings because they are pasted verbatim into the koboldcpp
# command line: Layers -> --gpulayers, Context -> --context / --contextsize.
Layers = "99" #@param [43]{allow-input: true}
Context = "8192" #@param [4096]{allow-input: true}
# Controls whether --smartcontext is added to the koboldcpp launch command.
Smart_Context = True #@param {type:"boolean"}

#@markdown ---

#@markdown ##### OPTIONAL: Build Latest Kobold (takes ~7 minutes)
# True: git-clone LostRuins/koboldcpp and build it with make.
# False: download and unpack the prebuilt archive at `url`.
# Only consulted when /content/koboldcpp/llama.cpp does not exist yet.
Force_Update_Build = True #@param {type:"boolean"}

#@markdown ---

#@markdown # Analytics

def calculate_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte pieces, so it
    never has to be held in memory all at once.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # read() returns b"" at end of file
                break
            digest.update(block)
    return digest.hexdigest()

# Updates the spreadsheet with the stats of the model when ran
def update_llama_stats(DownloadedModel_path):
    # Initialize gspread
    scope = [
        'https://www.googleapis.com/auth/spreadsheets',
        'https://www.googleapis.com/auth/drive.file',
        'https://www.googleapis.com/auth/drive'
    ]

    os.makedirs("/content/koboldcpp/stats/", exist_ok=True)
    !wget -q https://cdn.discordapp.com/attachments/945486970883285045/1114717554481569802/peppy-generator-388800-07722f17a188.json -O /content/koboldcpp/stats/peppy-generator-388800-07722f17a188.json
    config_path = '/content/koboldcpp/stats/peppy-generator-388800-07722f17a188.json'

    if os.path.exists(config_path):
        # File exists, proceed with creation of creds and client
        creds = Credentials.from_service_account_file(config_path, scopes=scope)
        client = gspread.authorize(creds)
    else:
        # File does not exist, print message and skip creation of creds and client
        print("Sheet credential file missing.")
        exit()  # Exit the script if the credentials are missing

    # Open the Google Sheet
    book = client.open("LlamaStats")
    sheet = book.get_worksheet(0)  # get the first sheet

    DownloadedModel_name = os.path.basename(DownloadedModel_path)
    DownloadedModel_hash = calculate_md5("/content/koboldcpp/model.gguf")

    colA_values = sheet.col_values(1)
    colB_values = sheet.col_values(2)
    colC_values = sheet.col_values(3)

    update_idx = -1

    for idx in range(len(colA_values)):
        if colA_values[idx] == DownloadedModel_name and idx < len(colB_values) and colB_values[idx] == DownloadedModel_hash:
            update_idx = idx + 1
            break

    if update_idx == -1:
        update_idx = len(colA_values) + 1

    current_count = colC_values[update_idx - 1] if update_idx <= len(colC_values) else ''
    if current_count.isdigit():
        new_count = str(int(current_count) + 1)
    else:
        new_count = '1'

    # Batch update to Google Sheets
    cell_list = [
        gspread.models.Cell(update_idx, 1, DownloadedModel_name),
        gspread.models.Cell(update_idx, 2, DownloadedModel_hash),
        gspread.models.Cell(update_idx, 3, new_count),
        gspread.models.Cell(update_idx, 4, DownloadedModel_path)
    ]
    sheet.update_cells(cell_list)
    print("\nUpdating values...\n")

#@markdown ##### OPTIONAL: Submit Download stats (for measuring model usage/popularity)
Submit_Download_Stats = False #@param {type:"boolean"}

# Every preset lives in a TheBloke repository whose quant files are named after
# the lower-cased repository name (minus the "-GGUF" suffix). The "{}" left in
# each resulting URL is the slot later filled with the chosen quant method.
_PRESET_URL_TEMPLATE = "https://huggingface.co/TheBloke/{repo}/resolve/main/{stem}.Q{{}}.gguf"
_PRESET_REPOS = (
    "MythoMax-L2-13B-GGUF",
    "ReMM-v2-L2-13B-GGUF",
    "ReMM-SLERP-L2-13B-GGUF",
    "Stheno-L2-13B-GGUF",
    "MLewdBoros-L2-13B-GGUF",
)

model_links = {
    repo: _PRESET_URL_TEMPLATE.format(repo=repo, stem=repo[:-len("-GGUF")].lower())
    for repo in _PRESET_REPOS
}

if not os.path.exists('/content/koboldcpp/model.gguf'):
    # Use aria2c to download
    print("Installing/updating aria2c...")
    !apt-get install aria2 -y >/dev/null 2>&1
    print("Finished installing aria2c.")

    os.makedirs('/content/koboldcpp/', exist_ok=True)
    if Use_Lora:
      if Lora_Link.strip():
          # Lora is enabled & link provided
          print("\nLora detected, will apply to model.\n")
          lora = Lora_Link.replace('/blob/', '/resolve/')
      else:
          # Lora is enabled but no link
          print("\nWarning: Lora enabled, but no link, not applying.\n")
    if Use_Manual_Model:
        if Manual_Link.strip():
            # Manual Model is enabled, and a link is provided
            print(f"\nManual Model detected; will use {Manual_Link} instead of {Model}\n")
            Model = Manual_Link.replace('/blob/', '/resolve/')
        else:
            # Manual Model is enabled, but no link is provided
            print(f"\nWarning: Manual Model enabled, but no link was found. Falling back to {Model}\n")
            if Model in model_links:
                Model = model_links[Model].format(Quant_Method)
    else:
        # Model is in model_links and has a supported format
        Model = model_links[Model].format(Quant_Method)

    if not re.search(r'(\.gguf|\.ggml|\.bin|\.safetensors)$', Model):
        print("--------------------------\n5 SECOND WARNING: Manual link provided doesn't end with a supported format.\nAre you sure you provided a direct link?\n--------------------------\n")
        time.sleep(5)
    elif Model.startswith('https://huggingface.co/') and not re.search(r'^https://huggingface\.co/.+/.+/.+/.+/[^/]+\.[^/]+$', Model):
        print("--------------------------\n10 SECOND WARNING: The HuggingFace link provided is of the entire model repository.\nPlease find the direct link to the quant you want to use.\n--------------------------\n")
        time.sleep(10)

def download_model_and_lora():
    if not os.path.exists('/content/koboldcpp/model.gguf'):

        # Start timing
        start_time = time.time()

        print(f"\n--------------------------\nDownloading {os.path.basename(Model)}...")
        os.chdir("/content/koboldcpp")
        !aria2c -x 16 -s 16 -k 1M --allow-overwrite="true" --summary-interval=5 $Model -d /content/koboldcpp -o model.gguf 2>&1 | grep -Ev 'Redirecting'

        elapsed_time = time.time() - start_time # Calculate and display elapsed time
        print(f"\nDownload took {elapsed_time:.2f} seconds")

        if Use_Lora:
          print(f"\n--------------------------\nDownloading {os.path.basename(lora)}...")
          os.chdir("/content/koboldcpp")
          !aria2c -x 16 -s 16 -k 1M --allow-overwrite="true" --summary-interval=5 $Model -d /content/koboldcpp -o lora.bin 2>&1 | grep -Ev 'Redirecting'

        if os.path.exists('/content/koboldcpp/model.gguf') and os.path.getsize("/content/koboldcpp/model.gguf") == 0:
            os.remove("/content/koboldcpp/model.gguf")

        if os.path.exists('/content/koboldcpp/lora.bin') and os.path.getsize("/content/koboldcpp/lora.bin") == 0:
            os.remove("/content/koboldcpp/lora.bin")

        if Submit_Download_Stats and os.path.exists("/content/koboldcpp/model.gguf"):
            DownloadedModel = Model[:]  # DownloadedModel is used for download stats
            update_llama_stats(DownloadedModel)

        print("--------------------------\n")
    else:
         print("--------------------------\nModel already downloaded; skipping redownload.\nDisconnect and delete runtime if you need to restart the colab fully.\n--------------------------\n")

thread = threading.Thread(target=download_model_and_lora)

# Checking if you already have a Kobold install
if not os.path.exists("/content/koboldcpp/llama.cpp"):
    if not Force_Update_Build:
        print("--------------------------")
        print("Force_Update_Build is set to False. Proceeding...")
        print("Downloading & extracting prebuilt Koboldcpp 1.44...")

        thread.start()

        # Downloading file
        print(f"Starting download from: {url}")
        response = requests.get(url, stream=True)
        filename = url.split("/")[-1]
        with open(filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
        print(f"Downloaded file: {filename}")

        # Create the koboldcpp directory if it doesn't exist
        destination_path = '/content/koboldcpp/'
        if not os.path.exists(destination_path):
            os.makedirs(destination_path)

        # Extracting the .tar.gz archive
        with tarfile.open(filename, 'r:gz') as tar:
            for member in tar.getmembers():
                adjusted_path = os.path.join(destination_path, member.name)
                try:
                    tar.extract(member, path=destination_path)
                except Exception as e:
                    print(f"Error extracting to '{adjusted_path}': {str(e)}")

        print("\nKobold extraction to /content/koboldcpp/ completed!")
        print("--------------------------\n")
    else:
        print("--------------------------\nSkipping prebuilt kobold, will build manually...")
        thread.start()
        !git clone https://github.com/LostRuins/koboldcpp
        %cd /content/koboldcpp
        !make LLAMA_CUBLAS=1
        print("--------------------------")
else:
    # In case koboldcpp already exists, just start the model download
    thread.start()

# Hosting the cloudflared server
if not os.path.exists("/content/koboldcpp/cloudflared-linux-amd64"):
    os.chdir("/content/koboldcpp")
    print("\n--------------------------\nDownloading cloudflared...\n")
    !wget -c -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
    !chmod +x cloudflared-linux-amd64
!echo > nohup.out
print("Attempting to launch cloudflared server...")
!nohup ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 &

# Check nohup.out for "protocol=quic" which signifies it launched
print("Checking if the server is up...\n")
while True:
    time.sleep(1)
    with open('nohup.out', 'r') as f:
        if 'connIndex=' in f.read():
            print("--------------------------\nServer up!")
            break

!cat nohup.out
print("--------------------------\n")

thread.join()

if os.path.exists("/content/koboldcpp/callback_url.py"):
    os.remove("/content/koboldcpp/callback_url.py")

if os.path.exists('/content/koboldcpp/model.gguf') and os.path.exists('/content/koboldcpp/lora.bin'):
    !wget -q https://github.com/kalomaze/koboldcpp/raw/alternate_colab/callback_url.py
    print("--------------------------\nAttempting to launch koboldcpp with the downloaded model and lora...")
    print("--------------------------\n")
    if Smart_Context:
      !python koboldcpp.py model.gguf --lora lora.bin --smartcontext --threads 2 --stream --usecublas 0 normal mmq --context $Context --ropeconfig 1.0 10000 --gpulayers $Layers --hordeconfig concedo --onready "/content/koboldcpp/callback_url.py"
    else:
      !python koboldcpp.py model.gguf --lora lora.bin --threads 2 --stream --usecublas 0 normal mmq --context $Context --ropeconfig 1.0 10000 --gpulayers $Layers --hordeconfig concedo --onready "/content/koboldcpp/callback_url.py"
elif os.path.exists('/content/koboldcpp/model.gguf'):
    !wget -q https://github.com/kalomaze/koboldcpp/raw/alternate_colab/callback_url.py
    print("--------------------------\nAttempting to launch koboldcpp with the downloaded model...")
    print("--------------------------\n")
    if Smart_Context:
      !python koboldcpp.py model.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $Context --quiet --remotetunnel
    else:
      !python koboldcpp.py model.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $Context --quiet --remotetunnel
else:
    print("Failed to download the GGUF model or LoRA. Please retry.")

# Quick How-To Guide

---
## Step 1. Keeping Google Colab Running
---

Google Colab tends to time out after a period of inactivity. To keep your session from timing out abruptly, you can use the following widget.

### Starting the Widget for Audio Player:

> <img src="https://cdn.discordapp.com/attachments/945486970883285045/1150363694191104112/image.png" width="50%"/>

### How the Widget Looks When Playing:

> <img src="https://cdn.discordapp.com/attachments/945486970883285045/1150363653997076540/image.png" width="50%"/>

Follow the visual cues in the images to start the widget and ensure that the notebook remains active.

---
## Step 2. Decide your Model
---

Pick a model and a quantization method from the dropdowns, then run the cell the same way you started the audio widget in Step 1.

### Select your Model and Quantization:

> <img src="https://cdn.discordapp.com/attachments/945486970883285045/1150370141557764106/image.png" width="40%"/>

Alternatively, you can specify a model manually.

### Manual Model Option:

> <img src="https://media.discordapp.net/attachments/945486970883285045/1150370631242764370/image.png" width="75%"/>

5_K_M 13b models should work with 4k (maybe 3k?) context on Colab, since the T4 GPU has ~16GB of VRAM. You can now start the cell, and after 1-3 minutes, it should end with your API link that you can connect to in [SillyTavern](https://docs.sillytavern.app/installation/windows/):

> <img src="https://cdn.discordapp.com/attachments/945486970883285045/1150464795032694875/image.png" width="80%"/>

---
# And there you have it!
### MythoMax (or any 7b / 13b Llama 2 model) in under 2 minutes.
#### (depending on whether or not huggingface downloads are experiencing high traffic)

---

# Credits
### - Made with ~~spite~~ love by kalomaze ❤️ <sub>(also here's the part where I shill my [Patreon](https://www.patreon.com/kalomaze) if you care!)</sub>
### - Koboldcpp is not my software, this is just to make it easy to use on Colab, for research use and beyond. You can find the original GitHub repository for it here: https://github.com/LostRuins/koboldcpp