import re
import uuid
import time
import asyncio
import os
import warnings
from datetime import datetime

from fastapi import FastAPI, Header, status
from fastapi.middleware.cors import CORSMiddleware
from fastapi.requests import Request
from fastapi.responses import JSONResponse, ORJSONResponse
from pydantic import BaseModel
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
import dateparser
from dateparser.search import search_dates
from langdetect import detect_langs
from textblob import TextBlob
import psycopg2
from psycopg2.extras import Json
from dotenv import load_dotenv

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Load environment variables
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL")

app = FastAPI(default_response_class=ORJSONResponse)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # or your domain(s)
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
CREATE_TABLE_QUERY = """
CREATE TABLE IF NOT EXISTS user_entries (
    uuid UUID PRIMARY KEY,
    user_id TEXT,
    user_name TEXT,
    user_email TEXT,
    raw_text TEXT,
    word_count INT,
    day_of_week TEXT,
    hour_of_day INT,
    month TEXT,
    year INT,
    type TEXT,
    expense_type TEXT,
    intent TEXT,
    confidence_scores JSONB,
    urgency_score FLOAT,
    time_mentions TEXT[],
    parsed_dates TEXT[],
    tense TEXT[],
    summary TEXT,
    people TEXT[],
    mood TEXT,
    language JSONB,
    sentiment_score FLOAT,
    tags TEXT[],
    action_required BOOLEAN,
    entities JSONB,
    amounts JSONB,
    stores JSONB,
    processing_time_ms INT,
    raw_json JSONB,
    created_at TIMESTAMPTZ DEFAULT now()
);
"""
def run_migrations():
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        cur.execute(CREATE_TABLE_QUERY)
        conn.commit()
        cur.close()
        conn.close()
        print("✅ Table checked/created at startup.")
    except Exception as e:
        print("❌ Migration failed:", e)
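# Note: run_migrations() is defined above but never invoked in this file; it is
# presumably called elsewhere. A minimal sketch of wiring it to application
# startup (an assumption, not part of the original code) would be:
#
# @app.on_event("startup")
# def _run_migrations_on_startup():
#     run_migrations()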
# Load classification and summarization models
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")
summarizer_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
# Lighter alternatives:
# classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# summarizer_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
# summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# Load NER model (dslim/bert-base-NER is a general English NER model)
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
# Labels for classification
labels = [
    "task (something to be done or completed)",
    "event (an activity that is happening or has happened)",
    "reminder (a message to remember something in the future)",
    "meeting (a planned gathering between people to discuss something)",
    "relationship (message about personal or emotional connection with someone)",
    "note (general note or quick thought not related to any specific category)",
    "journal (personal reflection or emotional writing about one's day or thoughts)",
    "memory (recollection or recording of a past moment or experience)",
    "status_update (current condition, feeling, or situation being shared)",
    "sick_notice (informing about illness or not feeling well)",
    "out_of_office (message about being unavailable for work or responsibilities)",
    "travel_plan (planning or mentioning a trip or journey)",
    "celebration (message about a festive occasion, party or achievement)",
    "expense (money spent on something, either small or large)",
    "news (update about public events, announcements, or current affairs)",
    "information (factual content or informative message not tied to user activity)",
    "purchase (buying or ordering something, like a product or service)",
    "other (does not clearly fall into any specific category)"
]
POPULAR_STORES = {
    "amazon": "shopping",
    "flipkart": "shopping",
    "myntra": "fashion",
    "swiggy": "food",
    "zomato": "food",
    "uber": "transport",
    "ola": "transport",
    "bigbasket": "groceries",
    "blinkit": "groceries",
    "jiomart": "groceries",
    "netflix": "entertainment",
    "hotstar": "entertainment",
    "airbnb": "travel",
    "makemytrip": "travel",
    "bookmyshow": "entertainment",
    "dunzo": "delivery",
    "meesho": "shopping",
    "nykaa": "beauty",
    "instamart": "groceries",
    "apple": "electronics",
    "google": "services"
}

# Keywords that suggest an expense (matched against lowercased text)
expense_keywords = [
    "paid", "bought", "purchased", "ordered", "spent", "payment",
    "recharged", "booked", "transaction", "debit", "renewed",
    "credit card", "cash", "amount", "transfer", "emi", "wallet",
    "petrol", "bill", "invoice", "kharida", "kharcha", "kharch", "paisa", "khareed",
    "order", "le liya", "diya", "khud diya", "khud kharida",
    "expense", "expenses", "cost", "costs", "costing", "buy", "buying", "purchase",
    "paid for", "paid to", "paid via", "paid using", "paid at",
    "bills", "bought from", "ordered from",
    "paid online", "paid cash", "paid card", "paid wallet", "paid app", "paid through",
    "khariden", "kharidi"
]
class TextInput(BaseModel):
    text: str
    user_id: str


# Function to detect popular store categories in the text
def detect_store_category(text: str):
    found_stores = []
    lowered = text.lower()
    for store, category in POPULAR_STORES.items():
        if store in lowered:
            found_stores.append({
                "store": store,
                "category": category
            })
    return found_stores
# Function to extract dates and time mentions from the text
def extract_dates_with_accuracy(text: str, amounts: list = None):
    amounts = amounts or []
    amount_values = {str(int(a["value"])) for a in amounts if isinstance(a["value"], (int, float))}
    original_text = text
    text_lower = text.lower()

    # Step 1: Replace Hinglish phrases with English equivalents (only for parsing)
    hinglish_map = {
        "aaj": "today",
        "kal": "tomorrow",  # Assuming future
        "parso": "day after tomorrow",
        "abhi": "now",
        "subah": "morning",
        "shaam": "evening",
        "raat ko": "night",
        "agli baar": "next time",
        "agli hafte": "next week",
        "agli mahine": "next month",
        "iss hafte": "this week",
        "iss mahine": "this month",
        "pichhle hafte": "last week",
        "tareekh": "date",
        "do din baad": "in 2 days",
        "teen din baad": "in 3 days",
    }
    replaced_text = text_lower
    for h_word, en_word in hinglish_map.items():
        replaced_text = re.sub(rf"\b{re.escape(h_word)}\b", en_word, replaced_text)

    # Step 2: Parse using dateparser
    results = search_dates(replaced_text, settings={
        "PREFER_DATES_FROM": "future",
        "RELATIVE_BASE": datetime.now(),
        "RETURN_AS_TIMEZONE_AWARE": False,
        "STRICT_PARSING": True,
    })

    time_mentions = []
    parsed_dates = []
    if results:
        for phrase, date in results:
            clean_phrase = phrase.strip().lower()
            if clean_phrase in amount_values:
                continue
            if clean_phrase in {"on", "at", "in", "by", "to", "of"}:
                continue
            if re.fullmatch(r"\d{3,4}", clean_phrase):  # skip bare numbers like 2025, 1200
                continue
            time_mentions.append(clean_phrase)
            parsed_dates.append(date.isoformat())
    return time_mentions, parsed_dates
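# Note: the helper above returns a (time_mentions, parsed_dates) tuple, in that
# order, so callers should unpack it the same way, e.g. (illustrative only):
#     time_mentions, parsed_dates = extract_dates_with_accuracy("dinner on 5 July 2025 at 8pm")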
def detect_tense(parsed_dates):
    now = datetime.now()
    tenses = set()
    for d in parsed_dates:
        dt = dateparser.parse(d)
        if not dt:
            continue
        if dt < now:
            tenses.add("past")
        elif dt > now:
            tenses.add("future")
        else:
            tenses.add("present")
    return list(tenses) if tenses else ["unknown"]


def generate_summary(text):
    input_ids = summarizer_tokenizer("summarize: " + text, return_tensors="pt").input_ids
    output_ids = summarizer_model.generate(input_ids, max_length=60, num_beams=4, early_stopping=True)
    return summarizer_tokenizer.decode(output_ids[0], skip_special_tokens=True)
def estimate_mood(text):
    text_lower = text.lower()
    # Expanded mood map with Hindi/Hinglish words and phrases
    mood_map = {
        "happy": [
            "happy", "excited", "good", "joy", "grateful", "glad", "pleased", "content", "satisfied", "cheerful", "elated",
            "maza aa gaya", "achha lag raha hai", "khush", "khushi", "badiya", "mast", "enjoy", "enjoyed", "mazedaar", "achha"
        ],
        "sad": [
            "sad", "upset", "crying", "lonely", "depressed", "down", "disappointed", "heartbroken", "unhappy",
            "bura lag raha hai", "dukhi", "udaas", "rona", "rona aa gaya", "dil toot gaya", "nirash"
        ],
        "angry": [
            "angry", "annoyed", "frustrated", "irritated", "mad", "furious", "gussa", "gusse mein", "chidh", "naraz",
            "bhadak gaya", "chidh gaya", "irritate", "irritated"
        ],
        "nervous": [
            "nervous", "anxious", "scared", "worried", "fearful", "uneasy", "tensed", "tension", "ghabrahat", "chinta",
            "parishan", "dara hua", "ghabra gaya", "stress", "stressed"
        ],
        "unwell": [
            "sick", "unwell", "not feeling well", "fever", "cold", "headache", "flu", "ill", "nauseous", "dizzy",
            "thak gaya", "thaka hua", "bimaar", "bimar", "bukhar", "sardard", "beemar", "kamjor", "thakan"
        ],
        "neutral": [
            "ok", "fine", "theek", "normal", "usual", "routine", "nothing special", "kuch khaas nahi", "no stress"
        ]
    }
    detected_moods = []
    for mood, keywords in mood_map.items():
        for kw in keywords:
            if kw in text_lower:
                detected_moods.append(mood)
                break  # Only need one match per mood
    # Use sentiment as a fallback if no mood keyword matched
    if not detected_moods:
        sentiment = get_sentiment_score(text)
        if sentiment > 0.2:
            return "happy"
        elif sentiment < -0.2:
            return "sad"
        else:
            return "neutral"
    # Priority: angry > sad > unwell > nervous > happy > neutral
    priority = ["angry", "sad", "unwell", "nervous", "happy", "neutral"]
    for mood in priority:
        if mood in detected_moods:
            return mood
    return "neutral"
def generate_tags(label, text):
    # Define stopwords manually (lightweight and fast)
    stopwords = set([
        "or", "to", "also", "the", "and", "a", "an", "in", "on", "of", "for",
        "with", "at", "by", "from", "as", "is", "was", "are", "be", "will",
        "has", "have", "it", "this", "that", "but", "if", "not", "so", "do",
        "does", "did", "am", "can", "i", "me", "my", "you", "we", "they", "he", "she"
    ])
    base_tags = [label]
    # Extract keywords (only alphabetic words with 4 or more letters)
    keywords = re.findall(r'\b[a-zA-Z]{4,}\b', text.lower())
    # Filter out stopwords
    filtered_keywords = [word for word in keywords if word not in stopwords]
    # Add forced tags based on context
    force_tags = []
    lowered = text.lower()
    if any(w in lowered for w in ["sick", "unwell", "not feeling well", "fever"]):
        force_tags += ["sick", "leave"]
    if "work" in lowered:
        force_tags.append("work")
    # Merge and deduplicate tags
    return list(set(base_tags + force_tags + filtered_keywords))
# Detect language using langdetect
def detect_language(text):
    try:
        langs = detect_langs(text)  # returns a list like [en:0.99, hi:0.01]
        if langs:
            top_lang = langs[0]
            return {"lang": top_lang.lang, "prob": round(top_lang.prob, 6)}
    except Exception:
        pass  # langdetect can fail on empty or purely numeric text
    return {"lang": "unknown", "prob": 0}


# Detect sentiment using TextBlob
def get_sentiment_score(text):
    try:
        blob = TextBlob(text)
        return round(blob.sentiment.polarity, 3)  # Range: -1 to 1
    except Exception:
        return 0.0


# Infer intent based on label
def infer_intent(label, text):
    label_to_intent = {
        "out_of_office": "taking_leave",
        "sick_notice": "taking_leave",
        "reminder": "set_reminder",
        "event": "log_event",
        "meeting": "schedule_meeting",
        "note": "log_note",
        "journal": "log_memory",
        "memory": "log_memory",
        "status_update": "status_update",
        "task": "create_task",
        "celebration": "log_event"
    }
    return label_to_intent.get(label, "other")
# Extract entities using NER
def extract_entities(text):
    ner_results = ner_pipeline(text)
    entities = {"people": [], "places": [], "organizations": [], "dates": [], "misc": []}
    PLACE_KEYWORDS = [
        "garden", "hotel", "resort", "mall", "restaurant", "cafe", "market",
        "school", "college", "temple", "station", "airport", "hospital",
        "park", "store", "shop", "gym", "theater", "cinema", "bank", "office",
        "court", "salon", "studio", "museum", "library", "club", "university",
        "guest house", "hostel", "canteen", "clinic", "zoo", "residency", "apartment"
    ]
    RELATION_KEYWORDS = [
        # English
        "mom", "dad", "father", "mother", "sister", "brother", "sis", "bro",
        "uncle", "aunt", "aunty", "cousin", "grandfather", "grandmother",
        "grandpa", "grandma", "wife", "husband", "son", "daughter", "child",
        "kids", "baby", "partner", "fiancé", "fiancée", "in-laws", "relatives",
        "friend", "colleague", "buddy", "pal", "mate", "acquaintance", "companion",
        "girlfriend", "boyfriend", "lover", "spouse", "significant other",
        # Hindi & Hinglish
        "maa", "mummy", "papa", "pappa", "pitaji", "mataji", "didi", "behen", "bhai",
        "chacha", "chachi", "mama", "mami", "tau", "tai", "nana", "nani",
        "dada", "dadi", "sasur", "sasuma", "jija", "saali", "bhabhi", "devar",
        "nandoi", "patni", "pati", "bachcha", "baccha", "beta", "beti", "putra", "putri",
        "sambandhi", "rishtedaar", "saheli", "dost", "yara", "saathi"
    ]
    for ent in ner_results:
        word = ent["word"].replace("##", "")
        if len(word) <= 2 or not word.isalpha():
            continue  # skip very short tokens and non-alphabetic fragments
        group = ent["entity_group"]
        if group == "PER":
            entities["people"].append(word)
        elif group == "LOC":
            entities["places"].append(word)
        elif group == "ORG":
            entities["organizations"].append(word)
        elif group == "DATE":
            entities["dates"].append(word)
        else:
            entities["misc"].append(word)
    # ✅ Fallback: Add known days/dates if not already captured
    day_keywords = re.findall(r'\b(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b', text, re.IGNORECASE)
    for day in day_keywords:
        if day not in entities["dates"]:
            entities["dates"].append(day)
    # ✅ Fallback: Add phrases like "product launch", "project", etc. to misc
    lower_text = text.lower()
    if "product launch" in lower_text:
        entities["misc"].append("product launch")
    if "birthday" in lower_text:
        entities["misc"].append("birthday")
    if "project" in lower_text:
        entities["misc"].append("project")
    # ✅ Add keyword-based places
    for place in PLACE_KEYWORDS:
        if place in lower_text and place not in entities["places"]:
            entities["places"].append(place)
    # ✅ Detect relation keywords (English + Hindi)
    for relation in RELATION_KEYWORDS:
        if re.search(rf"\b{re.escape(relation)}\b", lower_text):
            entities["people"].append(relation)
    # ✅ Deduplicate and return
    return {k: list(set(v)) for k, v in entities.items()}
# Function to calculate an urgency score from urgency keywords and parsed dates
def get_urgency_score(text, parsed_dates):
    urgency_keywords = ["urgent", "asap", "immediate", "must", "need to", "important", "don’t forget", "right away"]
    text_lower = text.lower()
    score = 0.0
    # 1. Keyword-based boost
    if any(word in text_lower for word in urgency_keywords):
        score = 0.7
    # 2. Time-based boost
    now = datetime.now()
    for d in parsed_dates:
        dt = dateparser.parse(d)
        if dt:
            hours = (dt - now).total_seconds() / 3600
            if 0 <= hours <= 24:
                score = max(score, 1.0)
            elif 24 < hours <= 72:
                score = max(score, 0.8)
            elif 72 < hours <= 168:
                score = max(score, 0.5)
    return round(score, 2)
# Function to get meta information about the text
def get_meta_info(text: str):
    now = datetime.now()
    return {
        "word_count": len(text.strip().split()),
        "day_of_week": now.strftime('%A'),  # e.g., "Thursday"
        "hour_of_day": now.hour,            # 0 to 23
        "month": now.strftime('%B'),        # e.g., "July"
        "year": now.year
    }


def is_year_context(text_snippet):
    return bool(re.search(r"\b(?:jan|feb|march|april|may|june|july|aug|sept|oct|nov|dec|year|in|on|by|for)\b", text_snippet))
# Function to extract amounts in various currencies from text
def extract_amounts(text: str):
    currency_patterns = [
        # INR variants
        (re.compile(r"(?:₹|rs\.?|inr)\s?(\d[\d,]*(?:\.\d+)?)"), "INR"),
        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?(?:₹|rs\.?|inr)"), "INR"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(rupees?|rupaye|rupiye)"), "INR"),
        # USD variants
        (re.compile(r"(?:\$)\s?(\d[\d,]*(?:\.\d+)?)"), "USD"),
        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?\$"), "USD"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(dollars?)"), "USD"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(cents?)"), "USD"),
        # EUR variants
        (re.compile(r"(?:€|eur)\s?(\d[\d,]*(?:\.\d+)?)"), "EUR"),
        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?€"), "EUR"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(euros?)"), "EUR"),
        # GBP variants
        (re.compile(r"(?:£|gbp)\s?(\d[\d,]*(?:\.\d+)?)"), "GBP"),
        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?£"), "GBP"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(pounds?)"), "GBP"),
        # INR large units
        (re.compile(r"(\d+(?:\.\d+)?)\s?(lacs?|lakhs?)"), "INR"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(crores?|crs?|cr)"), "INR"),
    ]
    results = []
    seen = set()
    text_lower = text.lower()
    for pattern, currency_code in currency_patterns:
        for match in pattern.finditer(text_lower):
            groups = match.groups()
            raw_number = next((g for g in groups if g and re.match(r"\d", g)), None)
            if not raw_number:
                continue
            # Ignore phone numbers and IDs (10+ digits)
            if len(raw_number.replace(",", "")) >= 10:
                continue
            try:
                number = float(raw_number.replace(",", ""))
                # Apply lakh/crore/cents multipliers
                if any(g in ("lac", "lacs", "lakh", "lakhs") for g in groups if g):
                    number *= 100_000
                elif any(g in ("crore", "crores", "cr", "crs") for g in groups if g):
                    number *= 10_000_000
                elif any(g in ("cent", "cents") for g in groups if g):
                    number /= 100
            except Exception:
                continue
            key = (number, currency_code)
            if key not in seen:
                seen.add(key)
                results.append({
                    "value": round(number, 2),
                    "currency": currency_code
                })

    # Fallback matching for generic numeric phrases near expense keywords
    if not results:
        fallback_patterns = [
            re.compile(
                r"\b(?:paid|spent|buy|purchase|cost|price|add(?:ed)?|gift(?:ed)?|bill(?: of)?|recharge(?:d)?|charged|transfer(?:red)?)\b[^0-9]{0,10}(\d[\d,]*(?:\.\d+)?)"
            ),
            re.compile(r"\b(\d[\d,]{2,8})\b\s?(?:rs|inr)?")
        ]
        for fallback_pattern in fallback_patterns:
            match = fallback_pattern.search(text_lower)
            if match:
                number_str = match.group(1).replace(",", "")
                # Ignore phone numbers and IDs
                if len(number_str) >= 10:
                    continue
                try:
                    number = float(number_str)
                    # Context check for year-like numbers
                    if 2020 <= number <= 2100:
                        # Check the surrounding ~30 characters for a year clue
                        span = match.span(1)
                        surrounding = text_lower[max(0, span[0] - 30):span[1] + 30]
                        if is_year_context(surrounding):
                            continue  # Looks like a year
                    key = (number, "INR")
                    if key not in seen:
                        seen.add(key)
                        results.append({
                            "value": round(number, 2),
                            "currency": "INR"
                        })
                    break  # Only extract the first match in fallback
                except Exception:
                    continue
    return results
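# Illustrative expectations for the extractor above (not executed tests):
#     extract_amounts("paid rs 1,200 for groceries")  -> [{"value": 1200.0, "currency": "INR"}]
#     extract_amounts("spent 2 lakhs on the car")     -> [{"value": 200000.0, "currency": "INR"}]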
def predict_expense_category(text, detected_stores):
    text_lower = text.lower()
    # 1. Use detected store category if available
    # (detected stores carry no "confidence" field today, so this effectively picks the first match)
    if detected_stores:
        best_match = max(detected_stores, key=lambda s: s.get("confidence", 1.0))
        return best_match["category"]
    # Category keyword mapping
    category_keywords = {
        "food": [
            "food", "lunch", "dinner", "breakfast", "snacks", "swiggy", "zomato", "dominos", "pizza", "kfc", "mcdonald",
            "restaurant", "hotel", "cafe", "canteen", "meal", "buffet", "thali", "tiffin", "order", "takeaway", "parcel",
            "eat", "eating", "brunch", "supper", "kitchen", "cook", "cooking", "chef", "dish", "dishes", "menu", "serve",
            "served", "serving", "food court", "food delivery", "delivery", "online order", "food app", "food bill",
            "beverage", "juice", "shake", "smoothie", "coffee", "tea", "chai", "cold drink", "soft drink", "soda", "water bottle",
            "ice cream", "dessert", "sweet", "sweets", "chocolate", "candy", "bakery", "bread", "cake", "pastry", "cookie",
            "biscuit", "chips", "fries", "burger", "sandwich", "roll", "wrap", "noodles", "pasta", "rice", "biryani", "curry",
            "gravy", "dal", "sabzi", "roti", "naan", "paratha", "chapati", "idli", "dosa", "vada", "sambar", "chutney", "samosa",
            "pakora", "chaat", "pani puri", "golgappa", "sev", "poha", "upma", "maggi", "maggie", "momos", "spring roll",
            "manchurian", "paneer", "butter chicken", "tandoori", "kebab", "shawarma", "pizza hut", "subway", "starbucks",
            # Hindi/Hinglish
            "khana", "nashta", "bhojan", "rasoi", "thali", "dabba", "tiffin", "chai", "paani", "jal", "kharcha khana",
            "khane ka bill", "khane ka paisa", "khane ki cheez", "khana order", "khana mangwaya", "khana khaya", "khana khud banaya",
            "khana kharch", "khana kharida", "khana diya", "khana laya", "khana banaya"
        ],
        "transport": [
            "uber", "ola", "taxi", "cab", "bus", "train", "metro", "flight", "auto", "rickshaw", "car", "gaadi", "yatra",
            "safar", "travel", "ticket", "plane", "udaan", "station", "airport", "rapido",
        ],
        "shopping": [
            "amazon", "flipkart", "myntra", "shopping", "clothes", "kapde", "apparel", "shoes", "jeans", "tshirt", "store",
            "fashion", "dukaan", "mall", "bazaar", "market", "kharida", "order diya", "le liya"
        ],
        "housing": [
            "rent", "apartment", "house", "ghar", "flat", "maintenance", "landlord", "kiraya", "makaan", "room", "hostel", "pg", "society"
        ],
        "utilities": [
            "electricity", "power", "bijli", "water", "pani", "gas", "bill", "recharge", "broadband", "wifi", "airtel", "jio",
            "phone", "mobile", "internet", "light", "cylinder", "connection"
        ],
        "entertainment": [
            "movie", "netflix", "hotstar", "bookmyshow", "spotify", "gaming", "youtube premium", "cinema", "film", "picture",
            "game", "khel", "manoranjan", "show", "concert"
        ],
        "health": [
            "medicine", "hospital", "doctor", "clinic", "pharmacy", "tablet", "surgery", "checkup", "dawai", "aspatal",
            "ilaaj", "health", "bimari", "test", "medical", "pathology", "chemist"
        ],
        "travel": [
            "trip", "travel", "tour", "vacation", "hotel", "airbnb", "booking.com", "goibibo", "makemytrip", "yatra",
            "safar", "holiday", "journey", "musafir", "booking", "trip kiya"
        ],
        "education": [
            "course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill",
            "padhai", "school", "college", "tuition", "kitab", "book", "fees", "shiksha"
        ],
        "digital_services": [
            "domain", "membership", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws",
            "google cloud", "saas", "subscription", "digital", "online", "app", "service", "renewal"
        ],
        "gifts_donations": [
            "gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift",
            "uphaar", "daan", "tohfa", "chanda", "puja", "mandir", "gurudwara"
        ],
        "finance": [
            "insurance", "sip", "mutual fund", "stock", "demat", "zerodha", "investment", "trading", "upstox", "crypto",
            "policy", "premium", "loan", "emi", "fd", "rd", "paisa", "bank", "account"
        ],
        "family_kids": [
            "kid", "baby", "school", "daycare", "tuition", "books", "uniform", "toys", "creche", "baccha", "bachche",
            "parivar", "family", "beti", "beta", "child", "children"
        ],
        "stationery": [
            "pen", "pencil", "notebook", "diary", "eraser", "sharpener", "paper", "stationery", "register", "files", "file",
            "markers", "highlighter", "sticky notes", "geometry box",
            "stapler", "ink", "printer paper", "stationary shop", "stationary", "copy", "kagaz", "likhne ka saman"
        ]
    }
    # 2. Match using keyword scores
    matched = {cat: sum(1 for kw in kws if kw in text_lower) for cat, kws in category_keywords.items()}
    best_match = max(matched.items(), key=lambda x: x[1])
    if best_match[1] > 0:
        return best_match[0]
    return "miscellaneous"
def insert_text_entry(data):
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        insert_query = """
            INSERT INTO user_entries (
                uuid, user_id, raw_text, word_count, day_of_week, hour_of_day, month, year,
                type, expense_type, intent, confidence_scores, urgency_score,
                time_mentions, parsed_dates, tense, summary,
                people, mood, language, sentiment_score, tags,
                action_required, entities, amounts, stores, processing_time_ms, raw_json
            ) VALUES (
                %(uuid)s, %(user_id)s, %(raw_text)s, %(word_count)s, %(day_of_week)s, %(hour_of_day)s, %(month)s, %(year)s,
                %(type)s, %(expense_type)s, %(intent)s, %(confidence_scores)s, %(urgency_score)s,
                %(time_mentions)s, %(parsed_dates)s, %(tense)s, %(summary)s,
                %(people)s, %(mood)s, %(language)s, %(sentiment_score)s, %(tags)s,
                %(action_required)s, %(entities)s, %(amounts)s, %(stores)s, %(processing_time_ms)s, %(raw_json)s
            )
            ON CONFLICT (uuid) DO NOTHING;
        """
        cur.execute(insert_query, {
            **data,
            "confidence_scores": Json(data["confidence_scores"]),
            "language": Json(data["language"]),
            "stores": Json(data["stores"]),
            "entities": Json(data["entities"]),
            "amounts": Json(data["amounts"]),
            "raw_json": Json(data["raw_json"])
        })
        conn.commit()
        cur.close()
        conn.close()
        print("✅ Data inserted successfully")
    except Exception as e:
        print("❌ Failed to insert data:", e)
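# The database helpers in this file open a fresh psycopg2 connection per call.
# If connection overhead becomes a concern, a pooled variant is one option
# (a sketch, not part of the original code; assumes a module-level pool is acceptable):
#
# from psycopg2.pool import SimpleConnectionPool
# db_pool = SimpleConnectionPool(minconn=1, maxconn=10, dsn=DATABASE_URL)
#
# def run_query(query, params=None):
#     conn = db_pool.getconn()
#     try:
#         with conn.cursor() as cur:
#             cur.execute(query, params)
#         conn.commit()
#     finally:
#         db_pool.putconn(conn)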
def health_check():
    return {"message": "✅ Hello from yourpartner/demospace — API is running!"}


async def not_found_handler(request: Request, exc):
    return ORJSONResponse(status_code=404, content={"error": "Route not found"})


async def internal_error_handler(request: Request, exc):
    return ORJSONResponse(status_code=500, content={"error": "Internal server error: " + str(exc)})
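# None of the handlers above carry FastAPI decorators in this file, so they are
# presumably registered elsewhere. For reference, one possible wiring (the route
# path and status codes here are assumptions, not taken from the original) is:
#
# app.get("/")(health_check)
# app.add_exception_handler(404, not_found_handler)
# app.add_exception_handler(Exception, internal_error_handler)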
# Delete a user's entries (note: removes all entry types for the user, not only expenses)
async def delete_user_expenses(user_id: str):
    if not user_id:
        return ORJSONResponse(
            status_code=400,
            content={"error": "User ID is required"}
        )
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        # Delete all entries for the user
        cur.execute("DELETE FROM user_entries WHERE user_id = %s", (user_id,))
        conn.commit()
        cur.close()
        conn.close()
        return ORJSONResponse(
            status_code=200,
            content={"message": "User entries deleted successfully"}
        )
    except Exception as e:
        return ORJSONResponse(
            status_code=500,
            content={"error": f"Database error: {str(e)}"}
        )
# List a user's expenses by user id
async def get_user_expenses(user_id: str):
    if not user_id:
        return ORJSONResponse(
            status_code=400,
            content={"error": "User ID is required"}
        )
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        # Fetch only the columns needed for the expenses view
        query = """
            SELECT raw_text, type, uuid, expense_type, summary, amounts, created_at
            FROM user_entries
            WHERE user_id = %s AND type = 'expense'
            ORDER BY created_at DESC
        """
        cur.execute(query, (user_id,))
        rows = cur.fetchall()
        # Convert rows to a list of dictionaries keyed by column name
        columns = ['raw_text', 'type', 'uuid', 'expense_type', 'summary', 'amounts', 'created_at']
        expenses = [dict(zip(columns, row)) for row in rows]
        cur.close()
        conn.close()
        return ORJSONResponse(content={"expenses": expenses})
    except Exception as e:
        return ORJSONResponse(
            status_code=500,
            content={"error": f"Database error: {str(e)}"}
        )
# Search endpoint to filter user entries based on various criteria
async def search_entries(
    userid: str = Header(..., description="User ID"),
    tags: str = "",
    query: str = "",
    startDate: str = "",
    endDate: str = "",
    type: str = ""
):
    # Validate user_id from header
    if not userid or not userid.strip():
        return ORJSONResponse(status_code=400, content={"error": "Missing or empty userid header."})
    # Build SQL filters
    filters = ["user_id = %s"]
    params = [userid]
    if type:
        filters.append("type = %s")
        params.append(type)
    if tags:
        tag_list = [t.strip() for t in tags.split(",") if t.strip()]
        filters.append("tags && %s")
        params.append(tag_list)
    if query:
        filters.append("(raw_text ILIKE %s OR summary ILIKE %s)")
        params.extend([f"%{query}%", f"%{query}%"])
    if startDate:
        try:
            start_dt = datetime.strptime(startDate, "%d-%m-%Y")
            filters.append("created_at >= %s")
            params.append(start_dt)
        except ValueError:
            return ORJSONResponse(status_code=400, content={"error": "Invalid startDate format. Use DD-MM-YYYY."})
    if endDate:
        try:
            end_dt = datetime.strptime(endDate, "%d-%m-%Y")
            filters.append("created_at <= %s")
            params.append(end_dt)
        except ValueError:
            return ORJSONResponse(status_code=400, content={"error": "Invalid endDate format. Use DD-MM-YYYY."})
    where_clause = " AND ".join(filters)
    query_sql = f"SELECT * FROM user_entries WHERE {where_clause} ORDER BY created_at DESC LIMIT 50"
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        cur.execute(query_sql, tuple(params))
        rows = cur.fetchall()
        columns = [desc[0] for desc in cur.description]
        entries = [dict(zip(columns, row)) for row in rows]
        # Remove raw_json from each entry in the results
        for entry in entries:
            entry.pop("raw_json", None)
        cur.close()
        conn.close()
    except Exception as e:
        return ORJSONResponse(status_code=500, content={"error": str(e)})
    return ORJSONResponse(content={"results": entries})
async def visualyse_dashboard(user_id: str):
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        # Fetch all entries for the user
        cur.execute("SELECT * FROM user_entries WHERE user_id = %s", (user_id,))
        rows = cur.fetchall()
        columns = [desc[0] for desc in cur.description]
        entries = [dict(zip(columns, row)) for row in rows]
        cur.close()
        conn.close()
    except Exception as e:
        return ORJSONResponse(status_code=500, content={"error": str(e)})

    # Section 1: Expense Overview
    expenses = [e for e in entries if e["type"] == "expense"]
    total_expense = sum(a["value"] for e in expenses for a in (e["amounts"] or []))
    expense_count = len(expenses)
    expense_by_category = {}
    for e in expenses:
        cat = e.get("expense_type", "miscellaneous")
        amt = sum(a["value"] for a in (e["amounts"] or []))
        expense_by_category[cat] = expense_by_category.get(cat, 0) + amt
    # Monthly/Weekly Trends
    monthly_trends = {}
    for e in expenses:
        key = f"{e['month']}-{e['year']}"
        amt = sum(a["value"] for a in (e["amounts"] or []))
        monthly_trends[key] = monthly_trends.get(key, 0) + amt

    # Section 2: Top Stores & Categories
    store_stats = {}
    for e in expenses:
        for s in (e["stores"] or []):
            store = s.get("store", "unknown")
            amt = sum(a["value"] for a in (e["amounts"] or []))
            if store not in store_stats:
                store_stats[store] = {"count": 0, "total": 0}
            store_stats[store]["count"] += 1
            store_stats[store]["total"] += amt
    top_categories = sorted(expense_by_category.items(), key=lambda x: x[1], reverse=True)

    # Section 3: Recent Expenses
    recent_expenses = sorted(expenses, key=lambda e: e.get("created_at", ""), reverse=True)[:7]

    # Section 4: Mood Trends
    mood_dist = {}
    for e in entries:
        mood = e.get("mood", "neutral")
        mood_dist[mood] = mood_dist.get(mood, 0) + 1

    # Section 5: Tags & Keywords
    tag_freq = {}
    for e in entries:
        for tag in (e["tags"] or []):
            tag_freq[tag] = tag_freq.get(tag, 0) + 1
    top_tags = sorted(tag_freq.items(), key=lambda x: x[1], reverse=True)[:15]

    # Section 6: Time Analysis
    day_stats = {}
    hour_stats = {}
    for e in expenses:
        day = e.get("day_of_week", "unknown")
        hour = e.get("hour_of_day", 0)
        amt = sum(a["value"] for a in (e["amounts"] or []))
        day_stats[day] = day_stats.get(day, 0) + amt
        hour_stats[hour] = hour_stats.get(hour, 0) + amt

    # Section 7: Meta Info
    entry_count = len(entries)
    type_dist = {}
    for e in entries:
        t = e.get("type", "other")
        type_dist[t] = type_dist.get(t, 0) + 1

    dashboard = {
        "expense_overview": {
            "total_expense": total_expense,
            "expense_count": expense_count,
            "expense_by_category": expense_by_category,
            "monthly_trends": monthly_trends
        },
        "top_stores": store_stats,
        "top_categories": top_categories,
        "recent_expenses": recent_expenses,
        "mood_distribution": mood_dist,
        "top_tags": top_tags,
        "time_analysis": {
            "by_day": day_stats,
            "by_hour": hour_stats
        },
        "meta_info": {
            "entry_count": entry_count,
            "type_distribution": type_dist
        }
    }
    return ORJSONResponse(content=dashboard)
async def analyze(input: TextInput):
    start_time = time.time()  # ⏱️ start
    text = input.text
    label_map = {
        "task (something to be done or completed)": "task",
        "event (an activity that is happening or has happened)": "event",
        "reminder (a message to remember something in the future)": "reminder",
        "meeting (a planned gathering between people to discuss something)": "meeting",
        "relationship (message about personal or emotional connection with someone)": "relationship",
        "note (general note or quick thought not related to any specific category)": "note",
        "journal (personal reflection or emotional writing about one's day or thoughts)": "journal",
        "memory (recollection or recording of a past moment or experience)": "memory",
        "status_update (current condition, feeling, or situation being shared)": "status_update",
        "sick_notice (informing about illness or not feeling well)": "sick_notice",
        "out_of_office (message about being unavailable for work or responsibilities)": "out_of_office",
        "travel_plan (planning or mentioning a trip or journey)": "travel_plan",
        "celebration (message about a festive occasion, party or achievement)": "celebration",
        "expense (money spent on something, either small or large)": "expense",
        "news (update about public events, announcements, or current affairs)": "news",
        "information (factual content or informative message not tied to user activity)": "information",
        "purchase (buying or ordering something, like a product or service)": "purchase",
        "other (does not clearly fall into any specific category)": "other"
    }
    # Run the blocking classifier call in a worker thread
    classification = await asyncio.to_thread(classifier, text, labels, hypothesis_template="This entry is about {}.")
    best_label = classification['labels'][0]
    best_label = label_map.get(best_label, best_label)
    amounts = await asyncio.to_thread(extract_amounts, text)
    # Reclassify as expense when a "task" contains expense keywords or amounts
    if (
        best_label == "task"
        and (any(word in text.lower() for word in expense_keywords) or amounts)
    ):
        best_label = "expense"
    if best_label == "purchase":
        best_label = "expense"
    if "reported" in text or "announced" in text or "collapsed" in text:
        if best_label in ["task", "reminder", "event"]:
            best_label = "news"
    scores = dict(zip(classification['labels'], classification['scores']))
    # Convert to short labels
    confidence_scores_full = {
        label_map.get(label, label): score
        for label, score in scores.items()
    }
    # Only keep the top 2
    confidence_scores = dict(sorted(confidence_scores_full.items(), key=lambda x: x[1], reverse=True)[:2])
    # extract_dates_with_accuracy returns (time_mentions, parsed_dates) in that order
    time_mentions, parsed_dates = await asyncio.to_thread(extract_dates_with_accuracy, text, amounts)
    tenses = detect_tense(parsed_dates)
    summary = await asyncio.to_thread(generate_summary, text)
    mood = estimate_mood(text)
    tags = generate_tags(best_label, text)
    language_detected = detect_language(text)
    sentiment_score = get_sentiment_score(text)
    if sentiment_score is None or sentiment_score == "":
        sentiment_score = 0.0
    entities = await asyncio.to_thread(extract_entities, text)
    people = entities["people"]  # Extracted people entities
    intent = infer_intent(best_label, text)
    urgency_score = get_urgency_score(text, parsed_dates)
    detected_stores = detect_store_category(text)
    expense_category = ""
    if best_label in ("expense", "purchase"):
        expense_category = predict_expense_category(text, detected_stores)
    # Define action triggers
    ACTION_TRIGGERS = ["plan", "organize", "schedule", "remember", "book", "call", "follow up", "need to"]
    action_required = any(word in text.lower() for word in ACTION_TRIGGERS)
    action_required = urgency_score >= 0.6 or action_required
    meta = get_meta_info(text)
    end_time = time.time()  # ⏱️ end
    processing_time_ms = round((end_time - start_time) * 1000)
    result = {
        "uuid": str(uuid.uuid4()),    # Unique identifier for this entry
        "user_id": input.user_id,     # ID of the requesting user
        "raw_text": text,
        "word_count": meta["word_count"],
        "day_of_week": meta["day_of_week"],
        "hour_of_day": meta["hour_of_day"],
        "month": meta["month"],
        "year": meta["year"],
        "type": best_label,
        "expense_type": expense_category,
        "intent": intent,
        "confidence_scores": confidence_scores,
        "urgency_score": urgency_score,
        "time_mentions": time_mentions,
        "parsed_dates": parsed_dates,
        "tense": tenses,
        "summary": summary.removeprefix("summary:").strip(),
        "people": people,
        "mood": mood,
        "language": language_detected,
        "sentiment_score": sentiment_score,
        "tags": tags,
        "action_required": action_required,
        "entities": entities,
        "amounts": amounts,
        "stores": detected_stores,
        "processing_time_ms": processing_time_ms
    }
    # Store a copy of the result as raw_json, without nesting raw_json itself
    raw_json_copy = result.copy()
    raw_json_copy.pop("raw_json", None)
    result["raw_json"] = raw_json_copy
    # Insert into database
    await asyncio.to_thread(insert_text_entry, result)
    # Log completion
    print("✅ Analysis complete")
    # Remove raw_json from the response
    result.pop("raw_json", None)
    # Return the result as a JSON response
    return ORJSONResponse(content=result)
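# Like the handlers earlier in the file, analyze / search_entries / get_user_expenses /
# delete_user_expenses / visualyse_dashboard are plain coroutines here and are presumably
# attached to routes elsewhere. A hypothetical wiring (paths and HTTP methods are
# assumptions, not taken from the original) could look like:
#
# app.post("/analyze")(analyze)
# app.get("/search")(search_entries)
# app.get("/expenses/{user_id}")(get_user_expenses)
# app.delete("/expenses/{user_id}")(delete_user_expenses)
# app.get("/dashboard/{user_id}")(visualyse_dashboard)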