NightPrince committed on
Commit 684a742 · verified · 1 Parent(s): a9d338c

Update app.py

Files changed (1)
  1. app.py +89 -36
app.py CHANGED
@@ -1,13 +1,13 @@
- import gradio as gr
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+ import streamlit as st
+ from transformers import (
+     AutoTokenizer, AutoModelForSequenceClassification,
+     pipeline, BlipProcessor, BlipForConditionalGeneration
+ )
  from peft import PeftModel
+ from PIL import Image
+ import requests
 
- # ✅ Load model and tokenizer
- base_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=9)
- model = PeftModel.from_pretrained(base_model, "NightPrince/peft-distilbert-toxic-classifier")
- tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
-
- # ✅ Label mapping
+ # 1️⃣ Setup label mapping
  id2label = {
      0: "Child Sexual Exploitation",
      1: "Elections",
@@ -19,44 +19,97 @@ id2label = {
      7: "Violent Crimes",
      8: "unsafe"
  }
- # ✅ Pipeline for easy inference
- pipe = pipeline(
-     "text-classification",
-     model=model,
-     tokenizer=tokenizer,
-     return_all_scores=True
- )
 
- # ✅ Define prediction function
- def classify_toxicity(query, image_description):
-     combined_text = query + " [SEP] " + image_description
-     preds = pipe(combined_text)[0]  # Get scores for all classes
+ # 2️⃣ Load BLIP captioning model
+ @st.cache_resource
+ def load_caption_model():
+     processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+     model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+     return processor, model
+
+ def caption_image(img):
+     processor, model = load_caption_model()
+     inputs = processor(images=img, return_tensors="pt")
+     out = model.generate(**inputs)
+     caption = processor.decode(out[0], skip_special_tokens=True)
+     return caption
+
+ # 3️⃣ Load your DistilBERT+LoRA classifier
+ @st.cache_resource
+ def load_toxic_classifier():
+     base_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=9)
+     model = PeftModel.from_pretrained(base_model, "NightPrince/peft-distilbert-toxic-classifier")
+     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+     pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)
+     return pipe
+
+ def classify_toxicity(text_input, caption):
+     pipe = load_toxic_classifier()
+     full_input = text_input + " [SEP] " + caption
+     preds = pipe(full_input)[0]
      preds_sorted = sorted(preds, key=lambda x: x['score'], reverse=True)
 
      top_label = preds_sorted[0]['label']
      top_score = preds_sorted[0]['score']
-
-     # Map label ID back to human-readable label
      label_id = int(top_label.split("_")[-1]) if "_" in top_label else int(top_label)
      final_label = id2label.get(label_id, "Unknown")
 
-     # Display all class scores (optional)
      scores_table = "\n".join(
          [f"{id2label[int(item['label'].split('_')[-1])]}: {round(item['score']*100, 2)}%" for item in preds]
      )
 
-     return f"Top Prediction: {final_label} ({round(top_score*100, 2)}%)\n\nFull Class Scores:\n{scores_table}"
-
- # ✅ Gradio UI
- iface = gr.Interface(
-     fn=classify_toxicity,
-     inputs=[
-         gr.Textbox(label="User Query"),
-         gr.Textbox(label="Image Description"),
-     ],
-     outputs=gr.Textbox(label="Toxicity Prediction"),
-     title="Toxic Category Classifier (DistilBERT + LoRA)",
-     description="Enter a user query and image description. The model will classify into one of the 9 toxic categories."
- )
+     return final_label, top_score, scores_table
+
+ # 4️⃣ Llama Guard (real API)
+ def llama_guard_filter(text):
+     url = "https://openrouter.ai/api/v1/chat/completions"
+     headers = {
+         "Authorization": f"Bearer {st.secrets['OPENROUTER_API_KEY']}",
+         "Content-Type": "application/json"
+     }
+     data = {
+         "model": "meta-llama/llama-guard-3-8b",
+         "messages": [
+             {"role": "system", "content": "You are a safety classifier. Return only 'safe' or 'unsafe'."},
+             {"role": "user", "content": text}
+         ]
+     }
+
+     try:
+         response = requests.post(url, headers=headers, json=data)
+         result = response.json()["choices"][0]["message"]["content"].strip().lower()
+         return result
+     except Exception as e:
+         print("Llama Guard API Error:", e)
+         return "safe"  # Fallback if error
+
+ # 🌐 Streamlit UI
+ st.set_page_config(page_title="Toxic Moderation System", layout="centered")
+ st.title("🛡️ Dual-Stage Toxic Moderation")
+ st.markdown("Moderate text and images using **Llama Guard** + **DistilBERT-LoRA**.\n\n- Stage 1: Hard Safety Filter (Llama Guard)\n- Stage 2: Fine Toxic Classifier (LoRA DistilBERT)")
+
+ text_input = st.text_area("✏️ Enter a text message", height=150)
+ uploaded_image = st.file_uploader("📷 Upload an image (optional)", type=["jpg", "jpeg", "png"])
+
+ image_caption = ""
+ if uploaded_image:
+     image = Image.open(uploaded_image)
+     st.image(image, caption="Uploaded Image", use_column_width=True)
+     with st.spinner("🔍 Generating caption with BLIP..."):
+         image_caption = caption_image(image)
+     st.success(f"📝 Caption: `{image_caption}`")
+
+ if st.button("🚀 Run Moderation"):
+     full_text = text_input + " [SEP] " + image_caption
+     with st.spinner("🛡️ Stage 1: Llama Guard..."):
+         safety = llama_guard_filter(full_text)
+
+     if safety == "unsafe":
+         st.error("❌ Llama Guard flagged this content as **UNSAFE**.\nModeration stopped.")
+     else:
+         st.success("✅ Safe by Llama Guard. Proceeding to classifier...")
+         with st.spinner("🧠 Stage 2: DistilBERT Toxic Classifier..."):
+             label, score, scores = classify_toxicity(text_input, image_caption)
+         st.markdown(f"### 🔍 Prediction: `{label}` ({round(score*100, 2)}%)")
+         st.text("📊 Class Probabilities:\n" + scores)
 
- iface.launch()
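
Below is a minimal sketch of how the two-stage flow added in this commit (Llama Guard 3 via OpenRouter first, then the LoRA DistilBERT classifier) could be exercised outside Streamlit as a quick smoke test. It reuses the same model IDs and request shape as app.py, but reads OPENROUTER_API_KEY from an environment variable instead of st.secrets; the helper names llama_guard() and run_moderation() and the sample inputs are hypothetical, not part of the committed app.

# Smoke-test sketch (not part of the commit). Assumptions: same model IDs as app.py,
# OPENROUTER_API_KEY set as an environment variable, hypothetical helper names.
import os
import requests
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from peft import PeftModel

# Stage 2 model: DistilBERT base plus the LoRA adapter from the Hub, as in app.py.
base = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=9)
model = PeftModel.from_pretrained(base, "NightPrince/peft-distilbert-toxic-classifier")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)

def llama_guard(text):
    # Stage 1: ask Llama Guard 3 (via OpenRouter) whether the text is safe.
    resp = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}"},
        json={
            "model": "meta-llama/llama-guard-3-8b",
            "messages": [
                {"role": "system", "content": "You are a safety classifier. Return only 'safe' or 'unsafe'."},
                {"role": "user", "content": text},
            ],
        },
        timeout=30,
    )
    return resp.json()["choices"][0]["message"]["content"].strip().lower()

def run_moderation(text, caption=""):
    combined = text + " [SEP] " + caption
    if llama_guard(combined) == "unsafe":
        return "unsafe", None, None
    scores = classifier(combined)[0]             # scores for all 9 classes
    top = max(scores, key=lambda s: s["score"])  # highest-scoring raw label, e.g. "LABEL_3"
    return "safe", top["label"], top["score"]

if __name__ == "__main__":
    print(run_moderation("hello there", "a photo of a cat"))  # hypothetical sample input

As a side note, recent transformers releases deprecate return_all_scores=True on text-classification pipelines in favour of top_k=None, and recent Streamlit releases deprecate use_column_width on st.image in favour of use_container_width; the committed code keeps the older arguments, which may emit deprecation warnings but should still run.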