Spaces:
Sleeping
Sleeping
Commit
·
31d4f49
1
Parent(s):
b7c1815
deepnote update
Browse files- faq.py +31 -16
- requirements.txt +1 -0
faq.py
CHANGED
|
@@ -1,20 +1,22 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
from langchain.document_loaders import DataFrameLoader
|
| 3 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 4 |
-
from langchain.vectorstores import AwaDB
|
| 5 |
from typing import List, Tuple
|
| 6 |
from langchain.docstore.document import Document
|
| 7 |
from langchain.embeddings.base import Embeddings
|
| 8 |
from langchain.vectorstores.base import VectorStore
|
| 9 |
import os
|
| 10 |
import shutil
|
|
|
|
| 11 |
|
| 12 |
SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
|
| 13 |
SHEET_URL_Y = "/edit#gid="
|
| 14 |
SHEET_URL_Y_EXPORT = "/export?gid="
|
| 15 |
-
|
| 16 |
VECTORDB_FOLDER = ".vectordb"
|
| 17 |
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
def faq_id(sheet_url: str) -> str:
|
|
@@ -41,26 +43,39 @@ def define_embedding_function(model_name: str) -> HuggingFaceEmbeddings:
|
|
| 41 |
return HuggingFaceEmbeddings(
|
| 42 |
model_name=model_name,
|
| 43 |
encode_kwargs={"normalize_embeddings": True},
|
| 44 |
-
cache_folder=
|
| 45 |
)
|
| 46 |
|
| 47 |
|
| 48 |
def get_vectordb(
|
| 49 |
-
faq_id: str, embedding_function: Embeddings, documents: List[Document] = None
|
| 50 |
) -> VectorStore:
|
| 51 |
vectordb = None
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
return vectordb
|
| 65 |
|
| 66 |
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
from langchain.document_loaders import DataFrameLoader
|
| 3 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 4 |
+
from langchain.vectorstores import AwaDB, Chroma
|
| 5 |
from typing import List, Tuple
|
| 6 |
from langchain.docstore.document import Document
|
| 7 |
from langchain.embeddings.base import Embeddings
|
| 8 |
from langchain.vectorstores.base import VectorStore
|
| 9 |
import os
|
| 10 |
import shutil
|
| 11 |
+
from enum import Enum
|
| 12 |
|
| 13 |
SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
|
| 14 |
SHEET_URL_Y = "/edit#gid="
|
| 15 |
SHEET_URL_Y_EXPORT = "/export?gid="
|
| 16 |
+
EMBEDDING_MODEL_FOLDER = ".embedding-model"
|
| 17 |
VECTORDB_FOLDER = ".vectordb"
|
| 18 |
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
|
| 19 |
+
VECTORDB_TYPE = Enum("VECTORDB_TYPE", ["AwaDB", "Chroma"])
|
| 20 |
|
| 21 |
|
| 22 |
def faq_id(sheet_url: str) -> str:
|
|
|
|
| 43 |
return HuggingFaceEmbeddings(
|
| 44 |
model_name=model_name,
|
| 45 |
encode_kwargs={"normalize_embeddings": True},
|
| 46 |
+
cache_folder=EMBEDDING_MODEL_FOLDER,
|
| 47 |
)
|
| 48 |
|
| 49 |
|
| 50 |
def get_vectordb(
    faq_id: str,
    embedding_function: Embeddings,
    documents: List[Document] = None,
    vectordb_type: VECTORDB_TYPE = VECTORDB_TYPE.AwaDB,
) -> VectorStore:
    """Open the vector store for an FAQ, or build it from ``documents``.

    When ``documents`` is ``None`` the store named ``faq_id`` is opened from
    ``VECTORDB_FOLDER``; otherwise a new store is created from the documents
    under that name in the same folder.

    Args:
        faq_id: Name of the AwaDB table / Chroma collection.
        embedding_function: Embedding model handed to the vector store.
        documents: Documents to index. ``None`` means "load, do not create".
        vectordb_type: Backend to use, as a ``VECTORDB_TYPE`` member. The
            member name as a string (``"AwaDB"`` / ``"Chroma"``) is accepted
            as well, since the parameter was previously annotated ``str``.

    Returns:
        The loaded or newly created vector store.

    Raises:
        ValueError: If ``vectordb_type`` does not name a supported backend.
        Exception: If ``documents`` is ``None`` and nothing could be loaded
            for ``faq_id``.
    """
    # Members are compared by identity below, so a plain string would never
    # match; resolve it to the enum member first.
    if isinstance(vectordb_type, str):
        try:
            vectordb_type = VECTORDB_TYPE[vectordb_type]
        except KeyError:
            raise ValueError(
                f"unsupported vectordb_type: {vectordb_type!r}"
            ) from None

    if vectordb_type is VECTORDB_TYPE.AwaDB:
        if documents is None:
            vectordb = AwaDB(
                embedding=embedding_function,
                log_and_data_dir=VECTORDB_FOLDER,
            )
            # A falsy result from load_local is treated as "table not found".
            if not vectordb.load_local(table_name=faq_id):
                raise Exception("faq_id may not exists")
            return vectordb
        return AwaDB.from_documents(
            documents=documents,
            embedding=embedding_function,
            table_name=faq_id,
            log_and_data_dir=VECTORDB_FOLDER,
        )

    if vectordb_type is VECTORDB_TYPE.Chroma:
        if documents is None:
            vectordb = Chroma(
                collection_name=faq_id,
                embedding_function=embedding_function,
                persist_directory=VECTORDB_FOLDER,
            )
            # Constructing Chroma does not fail for an unknown collection
            # here, so an empty id list is used as the "missing" signal.
            if not vectordb.get()["ids"]:
                raise Exception("faq_id may not exists")
            return vectordb
        # NOTE(review): depending on the installed chromadb version an
        # explicit persist() may be needed to flush to disk -- confirm.
        return Chroma.from_documents(
            documents=documents,
            embedding=embedding_function,
            collection_name=faq_id,
            persist_directory=VECTORDB_FOLDER,
        )

    # Previously an unknown selector fell through and returned None, which
    # only surfaced later as an AttributeError at the call site.
    raise ValueError(f"unsupported vectordb_type: {vectordb_type!r}")
|
| 80 |
|
| 81 |
|
requirements.txt
CHANGED
|
@@ -2,6 +2,7 @@ openpyxl
|
|
| 2 |
langchain
|
| 3 |
sentence_transformers
|
| 4 |
awadb
|
|
|
|
| 5 |
fastapi
|
| 6 |
uvicorn
|
| 7 |
gradio==3.35.2
|
|
|
|
| 2 |
langchain
|
| 3 |
sentence_transformers
|
| 4 |
awadb
|
| 5 |
+
chromadb
|
| 6 |
fastapi
|
| 7 |
uvicorn
|
| 8 |
gradio==3.35.2
|