# -*- coding: utf-8 -*-
"""CiPE_Streamlit
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1jACLFXfsdWM59lrfTQGcZVsTIHBO92R8
"""
# Om Maa
# Colab-only installs ('!' shell magics are not valid in a plain .py file);
# run these in the notebook, or install via requirements.txt for Streamlit:
# !pip install langchain predictionguard lancedb html2text sentence-transformers PyPDF2
# !pip install huggingface_hub transformers sentencepiece streamlit
import os
import urllib.request
import html2text
import predictionguard as pg
from langchain import PromptTemplate, FewShotPromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import numpy as np
import lancedb
from lancedb.embeddings import with_embeddings
import pandas as pd
# Read the Prediction Guard token from the environment; never hard-code secrets.
os.environ['PREDICTIONGUARD_TOKEN'] = os.environ.get('PREDICTIONGUARD_TOKEN', '<your-predictionguard-token>')
# Streamlit App Initiation
import streamlit as st
# Replace input() with Streamlit's input widgets
# Sidebar for inputting the name, age, gender, and ethnicity
name = st.sidebar.text_input('Name')
age = st.sidebar.number_input('Age', min_value=0, max_value=120, step=1)
gender = st.sidebar.selectbox('Gender', ['Male', 'Female', 'Other'])
ethnicity = st.sidebar.text_input('Ethnicity')
# Main container
with st.form(key='patient_form'):
    # Text input for the disease or condition
    disease = st.text_area('DISEASE', height=100)
    # Text input for prescriptions (the source of drug_names)
    prescriptions = st.text_area('PRESCRIPTIONS', height=100)
    # Text input for additional information
    additional_info = st.text_area('ADDITIONAL INFO', height=100)
    # Submit button for the form
    submit_button = st.form_submit_button(label='Predict Drug Effects')
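# Note: st.form batches the widgets above, so their values are only acted on once
# the submit button is pressed; submit_button is handled further below.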
from PyPDF2 import PdfReader
# Replace 'path_to_your_pdf_file.pdf' with the path to your PDF file
pdf_path = '/content/drug_side_effects_summary_cleaned.pdf'
reader = PdfReader(pdf_path)
# Initialize an empty string to accumulate text
text = ''
# Iterate over each page in the PDF
for page in reader.pages:
    # extract_text() can return None for image-only pages, so fall back to ""
    text += (page.extract_text() or "") + "\n"
# Now `text` contains the text content of the PDF.
print(text[:500])  # Example: print the first 500 characters to inspect the structure
import re
# Function to clean the extracted text
def clean_text(text):
    # Correct unwanted line breaks and spaces
    text = re.sub(r'-\n', '', text)   # Remove hyphenation
    text = re.sub(r'\n', ' ', text)   # Replace newlines with spaces
    text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces into one
    return text.strip()               # Remove leading and trailing spaces
# Clean the extracted text
cleaned_text = clean_text(text)
# Print a portion of the cleaned text to verify the cleaning
print(cleaned_text[:500])
# Define a function to chunk text with specified size and overlap using standard Python
def chunk_text(text, chunk_size=700, overlap=50):
    chunks = []
    start = 0
    while start < len(text):
        # After the first chunk, step back 'overlap' characters for context
        if start > 0:
            start -= overlap
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size
    return chunks
# Chunk the cleaned text into smaller pieces for LLM input
docs_alternative = chunk_text(cleaned_text, chunk_size=700, overlap=50)
# Display the first few chunks to verify the result
chunks_to_display_alt = 3
chunks_preview_alt = docs_alternative[:chunks_to_display_alt]
print(chunks_preview_alt)
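# The same chunking can be done with LangChain's CharacterTextSplitter (already
# imported above). A minimal sketch, assuming the legacy langchain API this
# script imports; left commented out so the manual chunker above stays in charge:
#
#     splitter = CharacterTextSplitter(separator=" ", chunk_size=700, chunk_overlap=50)
#     docs_alternative = splitter.split_text(cleaned_text)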
# Format all chunks (not just the preview) to avoid prompt template conflicts:
# '#' collides with the instruction markers used in the prompt template below
docs_alternative = [x.replace('#', '-') for x in docs_alternative]
# Embeddings setup (a dedicated variable, so the patient's `name` above is not clobbered)
model_name = "all-MiniLM-L12-v2"
model = SentenceTransformer(model_name)
# Embedding functions
def embed_batch(batch):
    return [model.encode(sentence, show_progress_bar=True) for sentence in batch]

def embed(sentence):
    return model.encode(sentence)
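# Note: model.encode also accepts a whole list and batches internally, which is
# usually faster than encoding one sentence at a time. An equivalent sketch:
#
#     def embed_batch(batch):
#         return model.encode(list(batch), show_progress_bar=True)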
# Create the LanceDB directory if it does not already exist
lancedb_dir = ".lancedb"
if not os.path.exists(lancedb_dir):
    os.mkdir(lancedb_dir)
uri = lancedb_dir
db = lancedb.connect(uri)
# Prepare metadata for embedding: one row per chunk, with its index and text
metadata = [[i, chunk] for i, chunk in enumerate(docs_alternative)]
doc_df = pd.DataFrame(metadata, columns=["chunk", "text"])
# Embed the documents
data = with_embeddings(embed_batch, doc_df)
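# with_embeddings runs embed_batch over the "text" column of doc_df (the default
# column name) and returns the data with an added "vector" column, ready for LanceDB.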
# LanceDB operations: overwrite any existing table on re-runs so the same
# chunks are not inserted twice (create_table already ingests `data`)
table = db.create_table("pdf_data", data=data, mode="overwrite")
message = "What are the side effects of doxycycline for treating Acne?"
results = table.search(embed(message)).limit(5).to_pandas()
#print(results.head())
message = "What are the side effects of doxycycline for treating Acne?"
results = table.search(embed(message)).limit(5).to_pandas()
#print(results.head())
# Assuming the setup for embeddings, LanceDB, and the prompt template is in place.
# drug_names are parsed from the prescriptions field; comma-separated entries are
# assumed here, so split on commas and strip the surrounding whitespace.
drug_names = [d.strip() for d in prescriptions.split(',') if d.strip()]
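# Prescriptions are often entered one per line rather than comma-separated. A more
# forgiving parser (a hypothetical helper, not part of the original flow) might be:
#
#     def parse_drug_names(raw):
#         # Split on commas, semicolons, or newlines, then drop empty entries
#         return [d.strip() for d in re.split(r'[,;\n]', raw) if d.strip()]
#
#     drug_names = parse_drug_names(prescriptions)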
def rag_answer_drug_side_effects(name, drug_names, disease):
    # Formulate a question about drug side effects
    message = (f"What are the potential side effects of using {drug_names} for treating "
               f"{disease}? Please provide a list of side effects specific to the use of "
               f"these drugs for this disease, for the patient {name}.")
    # Search the database for relevant context
    results = table.search(embed(message)).limit(10).to_pandas()
    results.sort_values(by=['_distance'], inplace=True, ascending=True)
    context = results['text'].iloc[0]  # Use the most relevant document
    # Define the prompt template
    template = """### Instruction:
Start with Hi, {name}. Then give a compassionate answer in bullet points and lists.
Read the input context below and respond with a medium-length answer to the given question. If you cannot find an exact answer, fall back to the closest information about the medication and disease.

### Input:
Context: {context}
Question: {question}

### Response:
"""
    # Augment the prompt with the retrieved context and the patient's name
    prompt = template.format(name=name, context=context, question=message)
    # Get a completion from the LLM
    result = pg.Completion.create(
        model="Neural-Chat-7B",
        prompt=prompt
    )
    return result['choices'][0]['text']
def rag_answer_drug_benefit_effects(name, drug_names, disease):
    # Formulate a question about drug benefits
    message = (f"What are the potential benefits of using {drug_names} for treating "
               f"{disease}? Please provide a list of benefits specific to the use of "
               f"these drugs for this disease, for the patient {name}.")
    # Search the database for relevant context
    results = table.search(embed(message)).limit(10).to_pandas()
    results.sort_values(by=['_distance'], inplace=True, ascending=True)
    context = results['text'].iloc[0]  # Use the most relevant document
    # Define the prompt template
    template = """### Instruction:
Start with Hi, {name}. Then give a compassionate answer in bullet points and lists.
Read the input context below and respond with a medium-length answer to the given question. If you cannot find an exact answer, fall back to the closest information about the medication and disease.

### Input:
Context: {context}
Question: {question}

### Response:
"""
    # Augment the prompt with the retrieved context and the patient's name
    prompt = template.format(name=name, context=context, question=message)
    # Get a completion from the LLM
    result = pg.Completion.create(
        model="Neural-Chat-7B",
        prompt=prompt
    )
    return result['choices'][0]['text']
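# The two RAG helpers above differ only in the aspect they ask about ("side
# effects" vs. "benefits"). A shared-helper sketch, assuming the prompt template
# string is hoisted to a module-level `template` constant (a refactoring
# suggestion, commented out so it does not change the script's behavior):
#
#     def rag_answer(name, drug_names, disease, aspect):
#         message = (f"What are the potential {aspect} of using {drug_names} for "
#                    f"treating {disease}? Please provide a list of {aspect} "
#                    f"specific to this disease, for the patient {name}.")
#         results = table.search(embed(message)).limit(10).to_pandas()
#         results.sort_values(by=['_distance'], inplace=True, ascending=True)
#         context = results['text'].iloc[0]
#         prompt = template.format(name=name, context=context, question=message)
#         result = pg.Completion.create(model="Neural-Chat-7B", prompt=prompt)
#         return result['choices'][0]['text']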
# Handle the form submission: submit_button is True only for the rerun in which
# the user clicked 'Predict Drug Effects'
if submit_button:
    # Validate input and handle errors/exceptions as necessary
    try:
        side_effects_response = rag_answer_drug_side_effects(name, drug_names, disease)
        benefits_response = rag_answer_drug_benefit_effects(name, drug_names, disease)
        st.write("Side Effects:", side_effects_response)
        st.write("Benefits:", benefits_response)
    except Exception as e:
        st.error(f"An error occurred: {e}")
# Hugging Face Hub upload leftovers from the Colab notebook. notebook_login()
# only works inside a notebook, so keep it commented out under Streamlit.
# from huggingface_hub import notebook_login
# notebook_login()
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Define the path to the checkpoint (a Hugging Face Hub repo id)
checkpoint_path = r"filius-Dei/CiPE"
# # Load the model (note: this would shadow the SentenceTransformer `model` above)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")