Dark-O-Ether committed on
Commit
8764b41
·
1 Parent(s): 5e4d6f3

Added application file

Files changed (2)
  1. app.py +814 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,814 @@
+ import streamlit as st
+ import streamlit.components.v1 as components
+ import pandas as pd
+ import random
+ import json
+ import hashlib  # stable hashing for run-to-run consistent token colours
+ import time
+
+ # Set page configuration
+ st.set_page_config(
+     page_title="tokeniser-py Demonstration",
+     page_icon="🔣",
+     layout="wide",
+ )
+
+ # Custom CSS for better UI
+ st.markdown("""
+ <style>
+     .main {
+         background-color: #0e1117;
+         color: white;
+     }
+     .stTextInput > div > div > input, .stTextArea > div > div > textarea {
+         background-color: #1e2130;
+         color: white;
+         border: 1px solid #30343e;
+         border-radius: 4px;
+         padding: 10px;
+     }
+     .token-display {
+         margin-top: 20px;
+         padding: 15px;
+         border-radius: 5px;
+         background-color: #1e2130;
+         line-height: 2;
+         overflow-wrap: break-word;
+     }
+     .token {
+         display: inline-block;
+         padding: 2px 4px;
+         margin: 2px;
+         border-radius: 3px;
+         position: relative;
+         cursor: pointer;
+         color: #0e1117 !important;
+         font-weight: 600;
+         text-shadow: 0px 0px 1px rgba(0,0,0,0.2);
+     }
+     .token:hover::after {
+         content: attr(data-id);
+         position: absolute;
+         top: -25px;
+         left: 0;
+         background: #3c4356;
+         color: white;
+         padding: 2px 6px;
+         border-radius: 3px;
+         font-size: 12px;
+         white-space: nowrap;
+         z-index: 100;
+     }
+     .button-container {
+         display: flex;
+         gap: 10px;
+         margin-bottom: 15px;
+     }
+     .stButton button {
+         background-color: #2c313d;
+         border: none;
+         color: white;
+     }
+     .stButton button:hover {
+         background-color: #3c4356;
+     }
+     .info-box {
+         margin-top: 20px;
+         padding: 20px;
+         border-radius: 5px;
+         background-color: #1e2130;
+         font-size: 14px;
+         line-height: 1.6;
+     }
+     .quote {
+         border-left: 4px solid #00ba7c;
+         padding-left: 10px;
+         margin: 10px 0;
+         color: #e0e0e0;
+     }
+     .highlight {
+         background-color: rgba(0, 186, 124, 0.15);
+         padding: 2px 4px;
+         border-radius: 3px;
+         font-weight: 500;
+     }
+     .comparison-table {
+         background-color: #262b38;
+         padding: 15px;
+         border-radius: 5px;
+         margin: 15px 0;
+     }
+     .section-title {
+         font-weight: 600;
+         margin-top: 15px;
+         margin-bottom: 8px;
+         color: #00ba7c;
+     }
+     .stRadio [role=radiogroup] {
+         background-color: #1e2130;
+         padding: 5px;
+         border-radius: 5px;
+     }
+     .header-container {
+         display: flex;
+         justify-content: space-between;
+         align-items: center;
+         padding: 10px 0;
+         margin-top: -80px;
+     }
+     .stats-container {
+         display: flex;
+         gap: 20px;
+         padding: 10px;
+         background-color: #1e2130;
+         border-radius: 5px;
+         margin-bottom: 20px;
+     }
+     .stat-box {
+         padding: 10px;
+     }
+     .stat-label {
+         font-size: 0.9em;
+         color: #aaa;
+     }
+     .stat-value {
+         font-size: 1.5em;
+         font-weight: bold;
+     }
+     a {
+         color: #00ba7c !important;
+         text-decoration: none;
+     }
+     a:hover {
+         text-decoration: underline;
+     }
+     .monospace {
+         font-family: monospace;
+     }
+     .note-box {
+         background-color: rgba(255, 204, 0, 0.1);
+         border-left: 3px solid rgba(255, 204, 0, 0.7);
+         padding: 10px 15px;
+         margin: 10px 0;
+         border-radius: 0 5px 5px 0;
+     }
+     .buttons-row {
+         display: flex;
+         gap: 10px;
+     }
+     /* Enhanced bullet points styling */
+     .bullet-point {
+         display: flex;
+         align-items: baseline;
+         margin: 8px 0;
+         padding: 4px 0;
+     }
+     .bullet-point-icon {
+         display: inline-flex;
+         align-items: center;
+         justify-content: center;
+         min-width: 24px;
+         height: 24px;
+         background-color: rgba(0, 186, 124, 0.2);
+         color: #00ba7c;
+         border-radius: 50%;
+         margin-right: 10px;
+         font-weight: bold;
+     }
+     .secondary-bullet {
+         background-color: rgba(0, 186, 124, 0.1);
+     }
+     .comparison-item {
+         display: flex;
+         align-items: baseline;
+         margin: 10px 0;
+         padding: 6px 0;
+     }
+     .comparison-icon {
+         display: inline-flex;
+         align-items: center;
+         justify-content: center;
+         min-width: 28px;
+         height: 28px;
+         background-color: rgba(0, 186, 124, 0.25);
+         color: #00ba7c;
+         border-radius: 50%;
+         margin-right: 12px;
+         font-weight: bold;
+     }
+     .comparison-text {
+         flex: 1;
+     }
+     .learn-more-section {
+         background-color: #1e2130;
+         border-radius: 5px;
+         padding: 20px;
+     }
+     .icon-wrapper {
+         display: inline-flex;
+         align-items: center;
+         justify-content: center;
+     }
+     .colored-icon {
+         display: inline-block;
+         color: #00ba7c;
+         font-size: 1.4em;
+         margin-right: 10px;
+     }
+     .library-feature {
+         display: flex;
+         align-items: baseline;
+         margin: 10px 0;
+     }
+     .feature-dot {
+         min-width: 18px;
+         height: 18px;
+         background-color: rgba(0, 186, 124, 0.2);
+         border-radius: 50%;
+         margin-right: 10px;
+         display: flex;
+         align-items: center;
+         justify-content: center;
+     }
+     .feature-text {
+         flex: 1;
+     }
+     .sub-feature {
+         display: flex;
+         padding-left: 30px;
+         margin: 8px 0;
+         align-items: baseline;
+     }
+     .sub-feature-dot {
+         min-width: 12px;
+         height: 12px;
+         background-color: rgba(0, 186, 124, 0.1);
+         border-radius: 50%;
+         margin-right: 10px;
+     }
+     .code-block {
+         background-color: #0e1117;
+         padding: 15px;
+         border-radius: 5px;
+         font-family: 'Courier New', monospace;
+         margin: 15px 0;
+         color: #e0e0e0;
+         border-left: 3px solid #00ba7c;
+     }
+     .code-line {
+         padding: 2px 0;
+         display: block;
+     }
+     .code-import {
+         color: #ff79c6;
+     }
+     .code-class {
+         color: #8be9fd;
+     }
+     .code-function {
+         color: #50fa7b;
+     }
+     .code-var {
+         color: #f1fa8c;
+     }
+     .code-string {
+         color: #f1fa8c;
+     }
+     .code-comment {
+         color: #6272a4;
+     }
+     .link-top-a {
+         color: rgb(72, 140, 255) !important;
+         font-size: 18px;
+     }
+     .link-top {
+         color: rgb(180, 220, 255) !important;
+         font-size: 18px;
+     }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Header with logo and title
+ st.markdown("""
+ <div class="header-container">
+     <div>
+         <h1>tokeniser-py 🔣</h1>
+         <a href="https://github.com/Tasmay-Tibrewal/tokeniser-py" class="link-top-a" style="display: inline;"><span style="background-color:rgba(100,146,154,0.17); padding:2px 4px; border-radius:3px;">Library GitHub</span></a>
+         <p class="link-top" style="display: inline;"> | </p>
+         <a href="https://huggingface.co/datasets/Tasmay-Tib/Tokeniser" class="link-top-a" style="display: inline;"><span style="background-color:rgba(100,146,154,0.17); padding:2px 4px; border-radius:3px;">HF Dataset</span></a>
+         <p class="link-top" style="display: inline;"> | </p>
+         <a href="https://github.com/Tasmay-Tibrewal/Tokeniser" class="link-top-a" style="display: inline;"><span style="background-color:rgba(100,146,154,0.17); padding:2px 4px; border-radius:3px;">GitHub Dataset (chunked)</span></a>
+         <p class="link-top" style="display: inline;"> | </p>
+         <a href="https://github.com/Tasmay-Tibrewal/Tokeniser-imp" class="link-top-a" style="display: inline;"><span style="background-color:rgba(100,146,154,0.17); padding:2px 4px; border-radius:3px;">GitHub Imp Files</span></a>
+         <p class="link-top" style="display: inline;"> | </p>
+         <a href="https://pypi.org/project/tokeniser-py/" class="link-top-a" style="display: inline;"><span style="background-color:rgba(100,146,154,0.17); padding:2px 4px; border-radius:3px;">PyPI Package</span></a>
+         <p></p>
+         <p style="font-size: 20px;"><strong>Learn about language model tokenization</strong></p>
+         <p style="font-size: 17px; margin-bottom: 5px;">
+             <span style="background-color:rgba(154, 187, 255,0.4); padding:2px 4px; border-radius:3px;">tokeniser-py's</span> custom tokenizer processes text using tokens, which are common sequences of characters found in a body of text. The model learns to understand the statistical relationships
+             between these tokens, and excels at producing the next token in a sequence of tokens. You can use the tool below to see how a piece of text might be tokenized by a language model, and the total count of tokens in that piece of text.
+         </p>
+     </div>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Initialize tokenizer
+ @st.cache_resource
+ def load_tokenizer(ln="1b", token_ordered=False):
+     try:
+         from tokeniser import Tokeniser
+         # Pass parameters based on selection
+         return Tokeniser(ln=ln, token_ordered=token_ordered)
+     except Exception as e:
+         st.error(f"Error loading tokenizer: {e}")
+         return None
+
+ st.markdown("###### Model")
+ # Radio selector for the model variant
+ model_version = st.radio(
+     "Model",
+     ["Default (1b model unordered)", "1b model ordered", "0.5b model unordered", "0.5b model ordered"],
+     horizontal=True,
+     label_visibility="collapsed",
+ )
+
+ # Map selected model version to parameters
+ if model_version == "Default (1b model unordered)":
+     ln_param = "1b"
+     ordered_param = False
+ elif model_version == "1b model ordered":
+     ln_param = "1b"
+     ordered_param = True
+ elif model_version == "0.5b model unordered":
+     ln_param = "0.5b"
+     ordered_param = False
+ else:
+     ln_param = "0.5b"
+     ordered_param = True
+
+ # Load tokenizer with selected parameters
+ tokenizer = load_tokenizer(ln=ln_param, token_ordered=ordered_param)
+
+ # Function to generate consistent pastel colors for tokens
+ @st.cache_data
+ def get_token_colors(tokens):
+     # Use a stable digest so colours stay consistent across reruns and restarts
+     # (Python's built-in hash() is randomised per process for strings)
+     colors = {}
+     for token in set(tokens):
+         # Generate a pastel color based on the hash of the token
+         hash_val = int(hashlib.md5(token.encode("utf-8")).hexdigest(), 16) % 360
+         colors[token] = f"hsl({hash_val}, 80%, 75%)"
+     return colors
+
+ # Function to display tokens with colors and hover effects
+ def display_colored_tokens(tokens, token_ids, token_colors):
+     html = ""
+     for token, token_id in zip(tokens, token_ids):
+         # Handle special characters for display
+         if token == '\n':
+             display_token = '\\n'
+         elif token == '\t':
+             display_token = '\\t'
+         else:
+             # Escape '&' first so the entities below aren't double-escaped
+             display_token = token.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace(" ", "&nbsp;")
+
+         html += f'<span class="token" style="background-color: {token_colors[token]};" data-id="{token_id}">{display_token}</span>'
+     return html
+
+ # Function to display token IDs
+ def display_token_ids(token_ids):
+     return f'<div class="monospace">{json.dumps(token_ids)}</div>'
+
+ # Initialize session state for text input if not exists
+ if 'text_input' not in st.session_state:
+     st.session_state.text_input = "Hi I am Tasmay, I am a third year undergraduate at IIT Kharagpur and this is my tokeniser. Please enter your text in this box"
+     st.session_state.text_ind = 0
+
+ st.markdown("###### Enter text to tokenize")
+ # Text input area
+ text_input = st.text_area(
+     "Enter text to tokenize",
+     st.session_state.text_input,
+     height=150,
+     placeholder="Please enter the text to tokenise",
+     label_visibility="collapsed",
+ )
+
+ def clear_text():
+     st.session_state.text_input = ""
+
+ def show_example():
+     examples = [
+         "Hi I am Tasmay, I am a third year undergraduate at IIT Kharagpur and this is my tokeniser. Please enter your text in this box",
+         "Wop, wop, wop, wop, wop, I'ma do my stuff",
+         "I got loyalty, got royalty inside my DNA",
+         "Sit down, be humble",
+         "We gon' be alright"
+     ]
+     st.session_state.text_ind = (st.session_state.text_ind + 1) % len(examples)
+     st.session_state.text_input = examples[st.session_state.text_ind]
+
+ # Add CSS for fixed-width buttons that wrap to new line
+ st.markdown("""
+ <style>
+     div[data-testid="stHorizontalBlock"] {
+         flex-wrap: wrap;
+         gap: 10px;
+         margin-top: -15px;
+         padding-top: 0px;
+         margin-bottom: -15px;
+     }
+
+     div[data-testid="stHorizontalBlock"] > div {
+         flex: 0 0 auto !important;
+         width: auto !important;
+         min-width: initial !important;
+     }
+
+     div[data-testid="stHorizontalBlock"] button {
+         width: 80px; /* Fixed width for "Clear" button */
+         margin-top: 0px;
+     }
+
+     div[data-testid="stHorizontalBlock"] div:nth-child(2) button {
+         margin-top: 0px;
+         width: 150px; /* Fixed width for "Show example" button */
+     }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Create a horizontal block for buttons
+ button_container = st.container()
+ with button_container:
+     cols = st.columns([1, 1, 10])
+     with cols[0]:
+         st.button("Clear", on_click=clear_text)
+     with cols[1]:
+         st.button("Show example", on_click=show_example)
+
+ # Process the text for tokenization.
+ # Defaults first, so the UI below still renders if the tokeniser failed to
+ # load (tokenizer is None) and the block below is skipped entirely.
+ tokens, token_ids = [], []
+ num_tokens, num_chars, chars_per_token = 0, 0, 0
+ if tokenizer:
+     try:
+         tokens, count = tokenizer.tokenise(text_input)
+         token_ids = tokenizer.token_ids(tokens)
+         num_tokens = len(tokens)
+         num_chars = len(text_input)
+         chars_per_token = num_chars / num_tokens if num_tokens > 0 else 0
+     except Exception as e:
+         st.error(f"Error tokenizing text: {e}")
+         tokens, token_ids = [], []
+         num_tokens, num_chars, chars_per_token = 0, 0, 0
+
+ # Inject custom CSS
+ st.markdown(
+     """
+ <style>
+     div[role="radiogroup"] > label {
+         height: 40px !important;
+         padding-left: 10px;
+         display: flex;
+         align-items: center;
+     }
+     div[role="radiogroup"] {
+         margin-top: -30px;
+         margin-bottom: 0px;
+     }
+     div[data-testid="stTextArea"] {
+         margin-top: -30px;
+     }
+ </style>
+     """,
+     unsafe_allow_html=True
+ )
+
+ # Create view toggle
+ view_option = st.radio(
+     "View",
+     ["Text", "Token IDs"],
+     horizontal=True,
+     label_visibility="collapsed",
+ )
+
+ # Get token colors if we have tokens
+ token_colors = get_token_colors(tokens) if tokens else {}
+
+ # Always display the token display, even if empty
+ if view_option == "Text":
+     if tokens:
+         st.markdown(f'<div class="token-display" style="margin-top: -25px;">{display_colored_tokens(tokens, token_ids, token_colors)}</div>', unsafe_allow_html=True)
+     else:
+         st.markdown('<div class="token-display" style="margin-top: -25px;">No tokens to display</div>', unsafe_allow_html=True)
+ else:
+     if token_ids:
+         st.markdown(f'<div class="token-display" style="margin-top: -25px;">{display_token_ids(token_ids)}</div>', unsafe_allow_html=True)
+     else:
+         st.markdown('<div class="token-display" style="margin-top: -25px;">No token IDs to display</div>', unsafe_allow_html=True)
+
+ # Always display the stats container, even if empty
+ st.markdown("""
+ <div class="stats-container" style="margin-top: -10px; margin-bottom: 10px;">
+     <div class="stat-box">
+         <div class="stat-label">Tokens</div>
+         <div class="stat-value">{}</div>
+     </div>
+     <div class="stat-box">
+         <div class="stat-label">Characters</div>
+         <div class="stat-value">{}</div>
+     </div>
+     <div class="stat-box">
+         <div class="stat-label">Chars per token</div>
+         <div class="stat-value">{:.2f}</div>
+     </div>
+ </div>
+ """.format(num_tokens, num_chars, chars_per_token),
+ unsafe_allow_html=True)
+
+ # Section 1: Tokenization Efficiency
+ st.markdown("---")
+ st.markdown("<h3 style='color:#00ba7c; margin-top:10px;'>Tokenization Efficiency</h3>", unsafe_allow_html=True)
+
+ # Quote block
+ st.markdown("""
+ <div style="border-left: 4px solid #00ba7c; padding-left: 15px; margin: 15px 0; color: #e0e0e0;">
+     A helpful rule of thumb is that one token generally corresponds to ~4 characters of text for
+     common English text. This translates to roughly ¾ of a word (so 100 tokens ~= 75 words).
+     <div style="font-style: italic; color: #aaa; margin-top: 5px;">— OpenAI</div>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Section 2: Our Analysis
+ st.markdown("<h3 style='color:#00ba7c; margin-top:20px;'>Our Analysis</h3>", unsafe_allow_html=True)
+ st.markdown("<p>We've conducted a thorough analysis of the token efficiency of our tokeniser against other tokenizers:</p>", unsafe_allow_html=True)
+
+ # Analysis points with enhanced styling
+ st.markdown("""
+ <div class="bullet-point">
+     <div class="bullet-point-icon">•</div>
+     <div>The <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px;">GPT-2 tokenizer</span> corresponds to approximately <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px;">3.9 characters per token</span></div>
+ </div>
+
+ <div class="bullet-point">
+     <div class="bullet-point-icon">•</div>
+     <div>English text corpora typically have average word lengths ranging from <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px;">4.7 to 5.1 characters</span>, which was observed to be <span style="background-color:rgba(0,186,124,0.4); padding:2px 4px; border-radius:3px;">4.73-4.79 in our dataset</span></div>
+ </div>
+
+ <div class="bullet-point">
+     <div class="bullet-point-icon">•</div>
+     <div>Thus for our dataset, traditional tokenizers convert to roughly <span style="background-color:rgba(0,186,124,0.4); padding:2px 4px; border-radius:3px;">⁴⁄₅ of a word</span> (100 tokens ≈ 80 words)</div>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Section 3: tokeniser-py Efficiency
+ st.markdown("<h3 style='color:#00ba7c; margin-top:20px;'><u>tokeniser-py</u> efficiency</h3>", unsafe_allow_html=True)
+ st.markdown("<p>Our tokenizer demonstrates different characteristics:</p>", unsafe_allow_html=True)
+
+ # Efficiency points with enhanced styling
+ st.markdown("""
+ <div class="bullet-point">
+     <div class="bullet-point-icon">•</div>
+     <div>Average token size of <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px;">~2.52 characters**</span> across all token types</div>
+ </div>
+
+ <div class="bullet-point">
+     <div class="bullet-point-icon">•</div>
+     <div>For alphanumeric tokens only: <span style="background-color:rgba(0,186,124,0.4); padding:2px 4px; border-radius:3px;">~3.97 characters per token</span></div>
+ </div>
+
+ <div class="bullet-point">
+     <div class="bullet-point-icon">•</div>
+     <div>This translates to approximately <span style="background-color:rgba(0,186,124,0.4); padding:2px 4px; border-radius:3px;">⁹⁄₁₀ of a word</span> (100 tokens ≈ 90 words)</div>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Section 4: Real-world Comparison with completely redesigned styling
+ st.markdown("""
+ <div style="background-color:#262b38; padding:20px; border-radius:5px; margin:25px 0;">
+     <h3 style="color:#00ba7c; margin-top:0px; margin-bottom:15px; font-size:1.3em;">Real-world Comparison</h3>
+     <p style="margin-bottom:15px;">We tested a 28-page blog post across different tokenizers:</p>
+     <div class="comparison-item">
+         <div class="comparison-icon">1</div>
+         <div class="comparison-text">
+             <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px; font-weight:500;">GPT-4o/GPT-4:</span>
+             <span style="font-size:1.1em; margin-left:8px;">~10.4k tokens</span>
+         </div>
+     </div>
+     <div class="comparison-item">
+         <div class="comparison-icon">2</div>
+         <div class="comparison-text">
+             <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px; font-weight:500;">GPT-3:</span>
+             <span style="font-size:1.1em; margin-left:8px;">~12.1k tokens</span>
+         </div>
+     </div>
+     <div class="comparison-item">
+         <div class="comparison-icon">3</div>
+         <div class="comparison-text">
+             <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px; font-weight:500;">tokeniser-py:</span>
+             <span style="font-size:1.1em; margin-left:8px;">~18.8k tokens</span>
+             <span style="color:#aaa;">(including ~8.4k space tokens and ~2.6k other special-char based tokens)</span>
+         </div>
+     </div>
+     <div class="comparison-item">
+         <div class="comparison-icon">4</div>
+         <div class="comparison-text">
+             <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px; font-weight:500;">tokeniser-py (alphanumeric only):</span>
+             <span style="font-size:1.1em; margin-left:8px;">~7.8k tokens</span>
+         </div>
+     </div>
+     <div class="comparison-item">
+         <div class="comparison-icon">5</div>
+         <div class="comparison-text">
+             <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px; font-weight:500;">GPT-4/GPT-4o (alphanumeric):</span>
+             <span style="font-size:1.1em; margin-left:8px;">~8k tokens</span>
+         </div>
+     </div>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Note box with enhanced styling
+ st.markdown("""
+ <div style="background-color:rgba(255,204,0,0.1); border-left:3px solid rgba(255,204,0,0.7); padding:15px; margin:20px 0; border-radius:0 5px 5px 0;">
+     <div style="font-size:18px; font-weight:bold; margin-bottom:12px; color:#ffcc00;">Note:</div>
+     <p style="line-height:2.2;"><span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.2); color:#ffcc00;">•</span>
+     <span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">**2.52 characters</span> is the average (adjusted frequency)-weighted token size, i.e. we weight each token's size by its true occurrences, obtained after adjusting its observed occurrences by the occurrences of its super-tokens.<br>
+     <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
+     <span>A super-token of a token, say '<span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">e</span>', is any token which contains '<span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">e</span>' (like '<span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">ear</span>', '<span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">ears</span>', '<span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">years</span>', etc.). When weighting by token length we find that smaller tokens get an unduly high weightage, because their occurrences inside super-tokens are counted as well.
+     To adjust for this we hierarchically subtract a token's occurrences within its super-tokens to get a true frequency.</span><br>
+     <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
+     <span>Un-adjusted frequency weighting gives an average size of <span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">~2.2 characters</span> per token, and a raw (un-weighted) average results in <span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">~4.6-4.7 chars</span> per token.</span><br>
+     <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
+     <span>Our tokenization strategy separates non-underscore special characters from alphanumeric tokens.</span><br>
+     <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
+     <span>We define alphanumeric tokens as any word that doesn't contain special characters (except underscores).</span><br>
+     <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
+     <span>For OpenAI's tokens, we considered any token containing at least one alphanumeric character (excluding underscores) as an alphanumeric token.</span><br>
+     <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
+     <span>This difference stems from the different special-character handling methodologies of the two tokenisers.</span></p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Section 5: Design Philosophy with enhanced styling
+ st.markdown("<h3 style='color:#00ba7c; margin-top:20px;'>Design Philosophy</h3>", unsafe_allow_html=True)
+ st.markdown("<p>Our approach prioritizes semantic representation over token count minimization:</p>", unsafe_allow_html=True)
+
+ # Philosophy points with enhanced styling
+ st.markdown("""
+ <div class="bullet-point">
+     <div class="bullet-point-icon">•</div>
+     <div>We consciously separate special characters from alphanumeric tokens</div>
+ </div>
+
+ <div class="bullet-point">
+     <div class="bullet-point-icon">•</div>
+     <div>This provides more available alphanumeric tokens in the vocabulary</div>
+ </div>
+
+ <div class="bullet-point">
+     <div class="bullet-point-icon">•</div>
+     <div>While this may increase total token count, it improves semantic representation</div>
+ </div>
+
+ <div class="bullet-point">
+     <div class="bullet-point-icon">•</div>
+     <div>Our design philosophy favors representation quality over token count minimization</div>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Footer link
+ st.markdown("""
+ <p style="margin-top:20px;">
+     Need a programmatic interface for tokenizing text? Check out our
+     <a href="https://pypi.org/project/tokeniser-py/">tokeniser-py</a> package for Python.
+ </p>
+ """, unsafe_allow_html=True)
+
+ # Footer with additional information
+ st.markdown("---")
+ st.markdown("""<h2 style='color:#00ba7c; margin-top:0px;'>About tokeniser-py</h2>
+
+ A high-performance, fully custom tokeniser built from scratch — no BPE, no existing NLP tokenisation scheme.
+ This tokeniser is based on a unique algorithm developed independently and trained on over 1 billion tokens
+ from the SlimPajama dataset (Val + Test), providing an efficient, interpretable, and extendable tokenisation pipeline.
+
+ <div class="library-feature">
+     <div class="feature-dot">•</div>
+     <div class="feature-text"><strong>Tokeniser built on a vocabulary of 131,072 tokens</strong></div>
+ </div>
+
+ <div class="library-feature">
+     <div class="feature-dot">•</div>
+     <div class="feature-text"><strong>Two versions of vocab:</strong> <code>0.5B</code> (Validation-only data) and <code>1B</code> (Validation + Test data)</div>
+ </div>
+
+ <div class="library-feature">
+     <div class="feature-dot">•</div>
+     <div class="feature-text"><strong>Token vocab built via a custom algorithm</strong> — no Byte Pair Encoding (BPE)</div>
+ </div>
+
+ <div class="library-feature">
+     <div class="feature-dot">•</div>
+     <div class="feature-text"><strong>Lightweight JSON format</strong> for token maps & token count maps</div>
+ </div>
+
+ <div class="library-feature">
+     <div class="feature-dot">•</div>
+     <div class="feature-text"><strong>Ready for integration</strong> into any LLM pre-tokenisation pipeline</div>
+ </div>
+
+ [GitHub Repository](https://github.com/Tasmay-Tibrewal/tokeniser-py) | [PyPI Package](https://pypi.org/project/tokeniser-py/)
+ """, unsafe_allow_html=True)
+
+ # Add explanation of the library in expandable section
+ with st.expander("Learn more about tokeniser-py"):
+     st.markdown("""
+ ### 🚀 What This Library Offers
+
+ - Tokeniser built on a vocabulary of **131,072 tokens**
+ - Two versions of vocab:
+     - `0.5B`: Validation-only data
+     - `1B`: Validation + Test data
+ - Token vocab built via a **custom algorithm** — no Byte Pair Encoding (BPE)
+ - Tokenisation logic includes:
+     - Token lookup from pre-generated token map
+     - Dynamic programming-based segmentation for out-of-vocab tokens
+     - One-hot encoding (NumPy or PyTorch)
+     - Visualisation utilities for tokens and token IDs
+ - Lightweight JSON format for token maps & token count maps
+ - Ready for integration into any LLM pre-tokenisation pipeline
+     """)
+
+ # Add custom CSS
+ st.markdown("""
+ <style>
+     div.stCodeBlock {
+         background-color: #1a1c24 !important;
+         border-radius: 10px;
+         padding-left: 25px;
+         padding-top: 15px;
+         padding-bottom: 15px;
+     }
+     pre.language-python {
+         background-color: #1a1c24 !important;
+         border-radius: 10px;
+     }
+     .code-header {
+         font-size: 1.5em;
+         font-weight: bold;
+         margin-top: 0em;
+         margin-bottom: 0.5em;
+         display: flex;
+         align-items: center;
+     }
+     .code-block {
+         background-color: #1a1c24;
+         border-radius: 5px;
+         padding: 1em;
+         margin-bottom: 1em;
+         font-family: 'Courier New', monospace;
+         white-space: pre;
+         color: #d4d4d4;
+         overflow-x: auto;
+         line-height: 1.5;
+     }
+     .keyword { color: #c586c0; }
+     .string { color: #CE9178; }
+     .function { color: #4ec9b0; }
+     .parenthesis { color: #ffd700; }
+     .var { color: #8cdcfe; }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Code header and block with simpler HTML
+ st.markdown("""
+ <div class="code-header">🛠️ Usage</div>
+ <pre class="code-block"><span class="keyword">from</span> <span class="function">tokeniser</span> <span class="keyword">import</span> <span class="function">Tokeniser</span><br>
+ <span class="var">t</span> = <span class="function">Tokeniser</span><span class="parenthesis">()</span><br>
+ <span class="var">tokens</span>, <span class="var">count</span> = <span class="var">t</span>.<span class="function">tokenise</span><span class="parenthesis">(</span><span class="string">"Your input text here."</span><span class="parenthesis">)</span><br>
+ <span class="var">token_ids</span> = <span class="var">t</span>.<span class="function">token_ids</span><span class="parenthesis">(</span><span class="var">tokens</span><span class="parenthesis">)</span></pre>
+ """, unsafe_allow_html=True)
+
+ st.markdown("""
+ Use `t.one_hot_tokens(token_ids)` for NumPy-based one-hot encoding, or `op='torch'` for PyTorch.
+
+ ### 📁 Vocab Files
+
+ - `ordered_tokenizer_1b_val_test_data.json` — Ordered tokens (1B data)
+ - `unordered_tokenizer_1b_val_test_data.json` — Unordered tokens (1B)
+ - `count_tokenizer_1b_val_test_data.json` — Token counts (1B)
+ - Similar structure for 0.5B val-only version
+ """)
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ streamlit>=1.27.0
+ pandas>=1.5.0
+ tokeniser-py