# =========================================================
# KIEMBU ↔ ENGLISH — NRF KENYA TRANSLATION SUITE
# =========================================================

import logging
import os
import re
import unicodedata
from xml.sax.saxutils import escape

import faiss
import fitz
import gradio as gr
import numpy as np
from PyPDF2 import PdfReader
from reportlab.lib.enums import TA_CENTER, TA_JUSTIFY
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.units import inch
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.cidfonts import UnicodeCIDFont
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

logger = logging.getLogger(__name__)

# ============================================
# SECTION 1 — SIMPLE DICTIONARY TRANSLATOR
# ============================================

# ===========================================
# Kiembu ↔ English Dictionary (Case & Punctuation Insensitive)
# ===========================================
# A "/" inside an English gloss separates alternative translations; each
# alternative is also indexed on its own for English → Kiembu lookup.
kiembu_to_english = {
    "Uvoro": "how are you",
    "Ri?": "When?",
    "Ku?": "Where?",
    "Uka": "come",
    "Hava": "here",
    "Varia": "there",
    "Vakuve": "close",
    "Kuraca": "far",
    "Ciakwa": "my/mine",
    "Cucu": "grandmother",
    "Mundu": "person",
    "Andu": "people",
    "Mwana": "child",
    "Mutumia": "woman",
    "Muthuri": "man",
    "Ngai": "God",
    "Wendo": "love",
    "Ngui": "dog",
    "Nyomba": "house",
    "Ndawa": "medicine",
    "Maaĩ": "water",
    "Mwaki": "fire",
    "Rĩũa": "sun",
    "Mweri": "moon",
    "Njata": "star",
    "ĩthiga": "stone",
    "Mũtĩ": "tree",
    "ĩthangũ": "leaf",
    "Mũri": "root",
    "ĩkoro": "bark",
    "MũndũMũrume": "man",
    "MũndũMũka": "woman",
    "fafa": "father",
    "Mami": "mother",
    "Gũtũ": "ear",
    "Ritho": "eye",
    "ĩniũrũ": "nose",
    "Kanyua": "mouth",
    "ĩgego": "tooth",
    "rũrĩmĩ": "tongue",
    "Njara": "hand",
    "Kũgũrũ": "foot",
    "Thakame": "blood",
    "ĩvĩndĩ": "bone",
    "Ngothi": "skin",
    "Nyama": "meat",
    "Nthamaki": "fish",
    "Giconi": "bird",
    "ĩtumbĩ": "egg",
    "Nvĩa": "horn",
    "Mũkia": "tail",
    "ĩvuta": "feather",
    "Njuĩrĩ": "hair",
    "Kĩongo": "head",
    "Ngingo": "neck",
    "Mũrukuthu": "back",
    "Ngoro": "heart",
    "Itema": "liver",
    "nyua": "drink",
    "ria": "eat",
    "mama": "sleep",
    "kua": "die",
    "ũka": "come",
    "ona": "see",
    "ĩgua": "hear",
    "menya": "know",
    "ĩciria": "think",
    "uga": "say",
    "ĩmwe": "one",
    "ĩgarĩ": "two",
    "ĩthatũ": "three",
    "ĩnya": "four",
    "ĩthano": "five",
    "ithathatu": "six",
    "mugwanja": "seven",
    "inyanya": "eight",
    "kenda": "nine",
    "ĩkumi": "ten",
    "nene": "big",
    "nini": "small",
    "ndaca": "long",
    "nguvi": "short",
    "mbega": "good",
    "njũku": "bad",
    "mbĩcuru": "full",
    "ĩtikĩndu": "empty",
    "nviũ": "hot",
    "nvoru": "cold",
    "ũtukũ": "night",
    "Mũthenya": "day",
    "Mbura": "rain",
    "rũkũngi": "wind",
    # "nthĩ" was listed twice ("earth" and "floor"); a dict literal keeps only
    # the last duplicate, so "earth" was silently lost. Both senses are merged.
    "nthĩ": "earth/floor",
    "kĩrĩma": "mountain",
    "rũnjĩ": "river",
    # "ĩria" was listed twice ("lake/sea" and "milk"); merged for the same reason.
    "ĩria": "lake/sea/milk",
    "cumbĩ": "salt",
    "mũthanga": "sand",
    "ndogo": "smoke",
    "nyaki": "grass",
    "njira": "path",
    "kivaro": "field",
    "kuraca": "far",
    "vakuvi": "near",
    "ava": "here",
    "varia": "there",
    "ũũ": "who",
    "ndwi": "what",
    "kũ": "where",
    "rĩ": "when",
    "atia": "how",
    "ka": "not",
    "onthe": "all",
    "engĩ": "many",
    "anini": "few",
    "jerũ": "new",
    "ngũrũ": "old",
    "kĩthũrũrũ": "round",
    "kaũgĩ": "sharp",
    "ritwa": "name",
    "tirama": "stand",
    "ĩkara": "sit",
    "thiĩ": "walk",
    "ngaria": "run",
    "va": "give",
    "oca": "take",
    "nyita": "hold",
    "tiniaa": "cut",
    "ringa": "hit",
    "ikia": "throw",
    "via": "burn",
    "ĩthambĩra": "swim",
    "ina": "sing",
    "katika": "dance",
    "theka": "laugh",
    "rĩra": "cry",
    "rũma": "bite",
    "mumunya": "suck",
    "nungira": "smell",
    "ĩtigĩra": "fear",
    "wina toro": "sleepy",
    "mũvũtu": "hungry",
    "mũnyondu": "thirsty",
    "ndune": "red",
    "njerũ": "white",
    "mbirũ": "black",
    "ngirini": "green",
    "yellũ": "yellow",
    "mbulu": "blue",
    "matu": "cloud",
    "maturĩ": "sky",
    "rũkũngũ": "dust",
    "mũu": "ashes",
    "mũkanda": "rope",
    "kamũti": "stick",
    "kaviũ": "knife",
    "ũta": "bow",
    "mũgwi": "arrow",
    "itumũ": "spear",
    "gitegithamaki": "fishhook",
    "neti": "net",
    "ĩtaru": "canoe",
    "mũrango": "door",
    "ĩtara": "roof",
    "Mũgeka": "mat",
    "kĩtanda": "bed",
    "mũrengeti": "blanket",
    "nyũngũ": "pot",
    "kanya": "calabash",
    "gĩkapũ": "basket",
    "nduramu": "drum",
    "rwĩmbo": "song",
    "rũgano": "story",
    "thakania": "play",
    "mũrata": "friend",
    "nthũ": "enemy",
    "civũ": "chief",
    "mũkũrũ": "elder",
    "ngombe": "cow/cattle",
    "mbũri": "goat",
    "ngondu": "sheep",
    "ngũkũ": "chicken",
    "ngamĩra": "camel",
    "nvuda": "donkey",
    "ndegwa": "ox",
    "mbegũ": "seed",
    "ketha": "harvest",
    "ũma": "hoe",
    "ĩthanwa": "axe",
    "mũro": "digging stick",
    "Mũtumi ciodo": "weaver",
    "Mwaki nyũngũ": "potter",
    "mũturi": "blacksmith",
    "mũgwĩmi": "hunter",
    "mũrĩthi": "herdsman",
    "mũteginthamaki": "fisherman",
    "thoko": "market",
    "kwendia": "trade",
    "cenjania": "barter",
    "mathaa": "time",
    "mavinda": "season",
    "ĩvinda rĩa riũa": "dry season",
    "ivinda ria mbura": "rainy season",
    "rĩũra": "famine",
    "thayũ": "peace",
    "mbara": "war",
    "gũrana": "marriage",
    "mũviki": "bride",
    "mũvikania": "groom",
    "ĩrua": "initiation",
    "kũrua": "circumcision",
    "kĩkuũ": "death",
    "ngoma": "spirit",
    "ngomi": "ancestor",
    "mũgĩmbĩ": "finger millet",
    "mũkombi": "pearl millet",
    "mwere": "bulrush millet",
    "mũvia": "sorghum",
    "mbembe": "maize",
    "minji": "cowpea",
    "ndengũ": "green gram",
    "njavĩ": "pigeon pea",
    "ndũma": "arrowroot/taro",
    "mwanga": "cassava",
    "gĩkũa": "yam",
    "ngwacĩ": "sweet potato",
    "ĩrenge": "pumpkin",
    "sukuma": "kale",
    "terere": "amaranth",
    "Thageti": "spider plant",
    "kaũrũra": "pumpkin leaves",
    "kunde": "cowpea leaves",
    "Mabuyu": "baobab fruit",
    "nthithi": "tamarind",
    "mbera": "guava",
    "matimoko": "custard apple/soursop",
    "macuca": "loquat",
    "kĩgwa": "sugarcane",
    "njahĩ": "sesame",
    "Marũrũ": "sunflower",
    "mbiringanya": "eggplant",
    "nyanya": "tomato",
    "gĩtũngũrũ": "onion",
    "kĩtũngũrũ saumu": "garlic",
    "tangauthi": "ginger",
    "murende": "turmeric",
    "nduru": "chili",
    "mboga": "cabbage",
    "karati": "carrot",
    "njukĩ": "bee",
    "ukĩ": "honey",
    "mwatu": "beehive",
    "mabaki": "wax",
    "maguta": "butter",
    "kĩrimũ": "cream",
    "alenya": "ghee",
    "ĩria ra kũgandithua": "sour milk",
}

_PUNCTUATION_RE = re.compile(r"[.,?!-]")
_NOT_FOUND = "Not found in dictionary"
_DIR_KI_EN = "Kiembu → English"
_DIR_EN_KI = "English → Kiembu"


# --- Helper: Normalize user input ---
def normalize(text):
    """Return *text* in the canonical form used as a lookup key.

    The text is NFC-normalized (so a decomposed "i" + combining tilde matches
    the precomposed "ĩ" used in the dictionary), lower-cased, stripped of the
    punctuation characters ``. , ? ! -`` and has its whitespace collapsed.
    ``None`` is treated as an empty string.
    """
    if text is None:
        return ""
    text = unicodedata.normalize("NFC", str(text)).lower()
    text = _PUNCTUATION_RE.sub("", text)
    # Trim *after* removing punctuation: "uvoro ?" must become "uvoro",
    # not "uvoro " with a dangling space that misses the lookup.
    return " ".join(text.split())


def _build_english_index(entries):
    """Build the English → Kiembu table from a Kiembu → English mapping.

    Every full gloss is indexed (a later entry overrides an earlier one with
    the same gloss). Each "/"-separated alternative is then added as well,
    without overriding a key that a full gloss already claimed.
    """
    english_index = {}
    for kiembu, english in entries.items():
        english_index[normalize(english)] = kiembu
    for kiembu, english in entries.items():
        if "/" not in english:
            continue
        for alternative in english.split("/"):
            key = normalize(alternative)
            if key:
                english_index.setdefault(key, kiembu)
    return english_index


# --- Prepare lookup tables (normalized keys) ---
kiembu_lower = {normalize(k): v for k, v in kiembu_to_english.items()}
english_lower = _build_english_index(kiembu_to_english)


# --- Translation Function ---
def translate_word(word, direction):
    """Translate a word between Kiembu and English, ignoring case & punctuation.

    Args:
        word: The word or short phrase to look up.
        direction: Either "Kiembu → English" or "English → Kiembu".

    Returns:
        The translation, "Not found in dictionary" when there is no entry,
        or an explanatory message when *direction* is not recognised.
    """
    cleaned = normalize(word)
    if direction == _DIR_KI_EN:
        return kiembu_lower.get(cleaned, _NOT_FOUND)
    if direction == _DIR_EN_KI:
        return english_lower.get(cleaned, _NOT_FOUND)
    return "Invalid translation direction. Use 'Kiembu → English' or 'English → Kiembu'."


# ============================================
# SECTION 2 — PDF TRANSLATION (Transformer + PDF)
# ============================================

# Placeholder model: this checkpoint name is the OPUS-MT "en-sw" pair, not a
# Kiembu model, so the "Kiembu" PDF output comes from this stand-in for now.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-sw")  # placeholder


def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file*, one page per line group.

    Args:
        pdf_file: A path or file object accepted by ``PyPDF2.PdfReader``.

    Returns:
        The page texts joined by newlines and stripped; pages for which no
        text could be extracted are skipped.
    """
    reader = PdfReader(pdf_file)
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(t for t in page_texts if t).strip()


def translate_text(text):
    """Translate *text* piece by piece with the module-level ``translator``.

    The text is split on ". " and each non-empty piece is translated on its
    own. A piece whose translation fails is kept untranslated (best effort)
    and the failure is logged instead of being silently swallowed.

    Returns:
        The translated pieces re-joined with ". ".
    """
    translated = []
    for chunk in text.split(". "):
        sentence = chunk.strip()
        if not sentence:
            continue
        try:
            translated.append(translator(sentence)[0]["translation_text"])
        except Exception:
            logger.exception("Translation failed; keeping original text for: %.80r", sentence)
            translated.append(sentence)
    return ". ".join(translated)


def create_pdf(translated_text, output_path="translated_output.pdf"):
    """Write *translated_text* to an A4 PDF and return the output path.

    Each non-empty line of the text becomes one justified paragraph below a
    centred title.

    Args:
        translated_text: The text to lay out.
        output_path: Where to write the PDF.

    Returns:
        *output_path*.
    """
    # NOTE(review): 'HeiseiKakuGo-W5' is one of ReportLab's built-in CID fonts;
    # whether it has glyphs for ĩ/ũ should be confirmed with a real document.
    pdfmetrics.registerFont(UnicodeCIDFont('HeiseiKakuGo-W5'))
    doc = SimpleDocTemplate(
        output_path,
        pagesize=A4,
        rightMargin=60,
        leftMargin=60,
        topMargin=72,
        bottomMargin=72,
    )
    styles = getSampleStyleSheet()
    title_style = ParagraphStyle(
        name='TitleStyle',
        parent=styles['Heading1'],
        alignment=TA_CENTER,
        fontName='HeiseiKakuGo-W5',
        fontSize=16,
        spaceAfter=20,
    )
    body_style = ParagraphStyle(
        name='BodyStyle',
        parent=styles['Normal'],
        alignment=TA_JUSTIFY,
        fontName='HeiseiKakuGo-W5',
        fontSize=12,
        leading=16,
    )
    story = [
        Paragraph("Translated Document — English → Kiembu", title_style),
        Spacer(1, 0.3 * inch),
    ]
    for para in translated_text.split("\n"):
        line = para.strip()
        if not line:
            continue
        # Paragraph parses its text as XML-like markup, so literal "<", ">"
        # and "&" coming from the document must be escaped first.
        story.append(Paragraph(escape(line), body_style))
        story.append(Spacer(1, 0.2 * inch))
    doc.build(story)
    return output_path


def translate_pdf_to_kiembu(pdf_file):
    """Translate an uploaded PDF and return ``(output_pdf_path, status)``.

    Args:
        pdf_file: The upload from the Gradio file component: either an object
            with a ``.name`` path attribute or a plain path string. ``None``
            (nothing uploaded) is reported through the status message.

    Returns:
        A ``(path, message)`` tuple; ``path`` is ``None`` when nothing could
        be translated.
    """
    if pdf_file is None:
        return None, "Please upload a PDF first."
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = extract_text_from_pdf(pdf_path)
    if not text:
        return None, "No readable text found in the uploaded PDF."
    translated_text = translate_text(text)
    output_pdf_path = create_pdf(translated_text)
    return output_pdf_path, "Translation complete! Download below."
# ============================================
# SECTION 3 — NRF LLM MODEL PDF CHAT
# ============================================
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
model_name = "google/gemma-2b-it"
hf_token = os.getenv("NRF_LLM_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    use_auth_token=hf_token,
)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)

# State of the currently loaded PDF, shared by the chat callbacks below.
chunks, index, pdf_loaded = [], None, False


def extract_pdf_text(pdf_file):
    """Return the concatenated text of all pages of *pdf_file*.

    Args:
        pdf_file: An object with a ``.name`` path attribute or a plain path
            string.
    """
    pdf_path = getattr(pdf_file, "name", pdf_file)
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (empty when *text* has no words).

    Raises:
        ValueError: If the sizes would make the window stall or move
            backwards, which previously caused an endless loop.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunk_list = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunk_list.append(" ".join(words[start:end]))
        # Once a chunk reaches the last word, any further window would only
        # repeat words already covered, so stop here.
        if end >= len(words):
            break
    return chunk_list


def embed_chunks(chunks_list):
    """Embed *chunks_list* and return a FAISS L2 index over the vectors.

    Raises:
        ValueError: If *chunks_list* is empty (there is nothing to index).
    """
    if not chunks_list:
        raise ValueError("No text chunks to index.")
    embeddings = np.ascontiguousarray(embed_model.encode(chunks_list), dtype="float32")
    idx = faiss.IndexFlatL2(embeddings.shape[1])
    idx.add(embeddings)
    return idx


def load_pdf_and_prepare(pdf_file):
    """Extract, chunk and index *pdf_file*; return a status message.

    The module-level ``chunks``, ``index`` and ``pdf_loaded`` are replaced
    together and only after every step succeeded, so a failed upload never
    leaves new chunks paired with an old index.
    """
    global chunks, index, pdf_loaded
    if pdf_file is None:
        return "⚠️ Please select a PDF file first."
    try:
        text = extract_pdf_text(pdf_file)
        new_chunks = chunk_text(text)
        if not new_chunks:
            return "❌ Error: No readable text found in the uploaded PDF."
        new_index = embed_chunks(new_chunks)
    except Exception as e:
        return f"❌ Error: {str(e)}"
    chunks, index, pdf_loaded = new_chunks, new_index, True
    return "✅ PDF uploaded and processed successfully."


def delete_pdf():
    """Forget the currently loaded PDF and return a status message."""
    global chunks, index, pdf_loaded
    chunks, index, pdf_loaded = [], None, False
    return "🗑️ PDF cleared. Ready for new upload."


def query_pdf(question, top_k=3):
    """Answer *question* from the loaded PDF using retrieved context.

    Args:
        question: The user's question.
        top_k: Maximum number of chunks to retrieve as context.

    Returns:
        The generated answer, or a warning message when no PDF is loaded or
        the question is empty.
    """
    if not pdf_loaded:
        return "⚠️ Please upload and process a PDF first."
    if not question or not question.strip():
        return "⚠️ Please enter a question."

    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, which would index chunks[-1] and duplicate the last chunk.
    k = max(1, min(int(top_k), len(chunks)))
    question_embedding = np.ascontiguousarray(embed_model.encode([question]), dtype="float32")
    _, neighbours = index.search(question_embedding, k)
    context = "\n".join(chunks[i] for i in neighbours[0] if i >= 0)

    prompt = f"Answer the question using the context:\n\n{context}\n\nQuestion: {question}\nAnswer:"
    response = generator(prompt)[0]["generated_text"]
    # Prefer removing the echoed prompt; splitting on the last "Answer:" would
    # cut an answer short if the model itself wrote "Answer:" again.
    if response.startswith(prompt):
        return response[len(prompt):].strip()
    return response.split("Answer:")[-1].strip()


# ============================================
# SECTION 4 — ENHANCED GRADIO UI
# ============================================

# UI texts live at module level, flush left, so that Markdown never sees
# source-code indentation as part of the content.
_CUSTOM_CSS = """
body {
    background: #f5f5f5;
    margin: 0;
    padding: 0;
    overflow: auto;
}
.gradio-container {
    display: flex;
    flex-direction: column;
    align-items: center;
    justify-content: flex-start;
    min-height: 100vh;
    padding: 30px 15px;
    box-sizing: border-box;
    border: 2px solid #ccc;
    border-radius: 16px;
    box-shadow: 0 4px 16px rgba(0,0,0,0.1);
    background: white;
    max-width: 900px;
    margin: 20px auto;
    overflow-y: auto;
}
::-webkit-scrollbar { width: 10px; }
::-webkit-scrollbar-track { background: #eee; border-radius: 10px; }
::-webkit-scrollbar-thumb { background: #aaa; border-radius: 10px; }
::-webkit-scrollbar-thumb:hover { background: #777; }
textarea, input[type="text"], .gr-textbox, .gr-input {
    border: 2px solid #bbb !important;
    border-radius: 10px !important;
    padding: 8px !important;
    box-shadow: inset 0 2px 4px rgba(0,0,0,0.05);
    transition: border-color 0.2s ease, box-shadow 0.2s ease;
}
textarea:focus, input[type="text"]:focus {
    border-color: #0078D7 !important;
    box-shadow: 0 0 5px rgba(0,120,215,0.3) !important;
    outline: none;
}
button, .gr-button {
    border-radius: 10px !important;
    padding: 10px 16px !important;
    font-weight: 600 !important;
}
"""

# NOTE(review): the header and footer contain plain text only; if they were
# meant to carry heading/centering markup, it is not present here — confirm.
_HEADER_MD = """

Kiembu ↔ English Translation Suite

Funded by NRF Kenya — Creating LLMs that Understand Native Languages


"""

_FOOTER_MD = """
© 2025 National Research Fund (NRF) Kenya — All Rights Reserved
"""

_DICTIONARY_MD = """
### Quick Word Translation — **Kiembu ↔ English**
Enter a single word or short phrase and get its translation instantly.

**Sample Words:**

| Kiembu | English |
|:--|:--|
| Uvoro | how are you |
| Ri? | When? |
| Ku? | Where? |
| Uka | come |
"""

_PDF_TRANSLATION_MD = """
### **English → Kiembu PDF Translator**
Upload an **English PDF document** (e.g., ID form, hospital form, passport form)
and get a **translated PDF in Kiembu** for download.
"""

_PDF_CHAT_MD = """
### **Interactive PDF Chat — NRF LLM Model**
Upload any **informative PDF** (e.g., government report, history book, or manual)
and ask natural-language questions to understand its content better.

**Examples:**
- "What does this document say about birth registration?"
- "Summarize Chapter 2."
"""

# The two contributor bullets previously had broken bold markers
# ("**:**" and an unclosed "**Developed by:"); both labels are now well-formed.
_ABOUT_MD = """
### About the Project
The **NRF Kenya Project** on *Creating LLMs that Understand Native Languages*
aims to preserve and promote indigenous linguistic heritage through advanced AI translation tools.

- **Languages Supported:** Kiembu ↔ English
- **Core Engine:** NRF LLM Model under development
- **Principal Investigator:** Prof Lucy Kawira – Chuka University
- **Developed by:** Technical Team: Coordinator- Casam Njagi – Chuka University
- **Funding Agency:** National Research Fund (NRF), Kenya
- **Objective:** Foster inclusion of native languages in AI-driven communication.
"""


def build_app():
    """Assemble the four-tab Gradio interface and return the ``gr.Blocks`` app."""
    with gr.Blocks(
        title="Kiembu ↔ English — NRF Kenya Project",
        css=_CUSTOM_CSS,
        theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray"),
    ) as app:
        gr.Markdown(_HEADER_MD)

        with gr.Tabs():
            # -----------------------------
            # TAB 1: DICTIONARY TRANSLATOR
            # -----------------------------
            with gr.TabItem("Dictionary Translator"):
                gr.Markdown(_DICTIONARY_MD)
                word_input = gr.Textbox(
                    label="Enter Word",
                    placeholder="e.g. 'Uvoro' or 'how are you'",
                    lines=1,
                )
                direction_choice = gr.Radio(
                    ["Kiembu → English", "English → Kiembu"],
                    value="Kiembu → English",
                    label="Select Direction",
                )
                word_output = gr.Textbox(label="Translation Result")
                gr.Button("Translate").click(
                    translate_word, [word_input, direction_choice], word_output
                )

            # -----------------------------
            # TAB 2: PDF TRANSLATION
            # -----------------------------
            with gr.TabItem("PDF Translation"):
                gr.Markdown(_PDF_TRANSLATION_MD)
                pdf_input = gr.File(label="Upload English PDF", file_types=[".pdf"])
                translate_btn = gr.Button("Translate to Kiembu")
                output_file = gr.File(label="Download Translated PDF")
                translation_status = gr.Textbox(label="Status", interactive=False)
                translate_btn.click(
                    translate_pdf_to_kiembu,
                    inputs=[pdf_input],
                    outputs=[output_file, translation_status],
                )

            # -----------------------------
            # TAB 3: NRF LLM MODEL Q&A
            # -----------------------------
            with gr.TabItem("PDF Chat (NRF LLM Model)"):
                gr.Markdown(_PDF_CHAT_MD)
                chat_pdf = gr.File(label="Upload PDF Document")
                chat_status = gr.Textbox(label="Status")
                gr.Button("Process PDF").click(load_pdf_and_prepare, chat_pdf, chat_status)
                gr.Button("Clear PDF").click(delete_pdf, None, chat_status)
                question_box = gr.Textbox(
                    lines=2,
                    label="Ask a Question",
                    placeholder="e.g. 'Summarize the introduction section.'",
                )
                answer_box = gr.Textbox(lines=6, label="Answer")
                gr.Button("Query PDF").click(query_pdf, question_box, answer_box)

            # -----------------------------
            # TAB 4: ABOUT
            # -----------------------------
            with gr.TabItem("About"):
                gr.Markdown(_ABOUT_MD)

        gr.Markdown(_FOOTER_MD)

    return app


demo = build_app()

# Start the server only when run as a script, so importing this module
# (e.g. for tests or tooling) does not block on a running web server.
if __name__ == "__main__":
    demo.launch()