# -*- coding: utf-8 -*-
"""CiPE_Streamlit
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1jACLFXfsdWM59lrfTQGcZVsTIHBO92R8
"""
# Om Maa
# Colab-only installs ('!' shell magics are not valid in a plain .py file);
# run these in the notebook, or install via requirements.txt for Streamlit:
# !pip install langchain predictionguard lancedb html2text sentence-transformers PyPDF2
# !pip install huggingface_hub transformers sentencepiece streamlit
import os
import urllib.request
import html2text
import predictionguard as pg
from langchain import PromptTemplate, FewShotPromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import numpy as np
import lancedb
from lancedb.embeddings import with_embeddings
import pandas as pd
# Read the Prediction Guard token from the environment; never hard-code secrets.
os.environ['PREDICTIONGUARD_TOKEN'] = os.environ.get('PREDICTIONGUARD_TOKEN', '<your-predictionguard-token>')
# Streamlit App Initiation
import streamlit as st
# Replace input() with Streamlit's input widgets
# Sidebar for inputting the name, age, gender, and ethnicity
name = st.sidebar.text_input('Name')
age = st.sidebar.number_input('Age', min_value=0, max_value=120, step=1)
gender = st.sidebar.selectbox('Gender', ['Male', 'Female', 'Other'])
ethnicity = st.sidebar.text_input('Ethnicity')
# Main container
with st.form(key='patient_form'):
    # Text input for the disease or condition
    disease = st.text_area('DISEASE', height=100)
    # Text input for prescriptions (the source of drug_names)
    prescriptions = st.text_area('PRESCRIPTIONS', height=100)
    # Text input for additional information
    additional_info = st.text_area('ADDITIONAL INFO', height=100)
    # Submit button for the form
    submit_button = st.form_submit_button(label='Predict Drug Effects')
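# Note: st.form batches the widgets above, so their values are only acted on once
# the submit button is pressed; submit_button is handled further below.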
from PyPDF2 import PdfReader
# Replace 'path_to_your_pdf_file.pdf' with the path to your PDF file
pdf_path = '/content/drug_side_effects_summary_cleaned.pdf'
reader = PdfReader(pdf_path)
# Initialize an empty string to accumulate text
text = ''
# Iterate over each page in the PDF
for page in reader.pages:
    # extract_text() can return None for image-only pages, so fall back to ""
    text += (page.extract_text() or "") + "\n"
# Now `text` contains the text content of the PDF.
print(text[:500])  # Example: print the first 500 characters to inspect the structure
import re
# Function to clean the extracted text
def clean_text(text):
    # Correct unwanted line breaks and spaces
    text = re.sub(r'-\n', '', text)   # Remove hyphenation
    text = re.sub(r'\n', ' ', text)   # Replace newlines with spaces
    text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces into one
    return text.strip()               # Remove leading and trailing spaces
# Clean the extracted text
cleaned_text = clean_text(text)
# Print a portion of the cleaned text to verify the cleaning
print(cleaned_text[:500])
# Define a function to chunk text with specified size and overlap using standard Python
def chunk_text(text, chunk_size=700, overlap=50):
    chunks = []
    start = 0
    while start < len(text):
        # After the first chunk, step back 'overlap' characters for context
        if start > 0:
            start -= overlap
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size
    return chunks
# Chunk the cleaned text into smaller pieces for LLM input
docs_alternative = chunk_text(cleaned_text, chunk_size=700, overlap=50)
# Display the first few chunks to verify the result
chunks_to_display_alt = 3
chunks_preview_alt = docs_alternative[:chunks_to_display_alt]
print(chunks_preview_alt)
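# The same chunking can be done with LangChain's CharacterTextSplitter (already
# imported above). A minimal sketch, assuming the legacy langchain API this
# script imports; left commented out so the manual chunker above stays in charge:
#
#     splitter = CharacterTextSplitter(separator=" ", chunk_size=700, chunk_overlap=50)
#     docs_alternative = splitter.split_text(cleaned_text)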
# Format all chunks (not just the preview) to avoid prompt template conflicts:
# '#' collides with the instruction markers used in the prompt template below
docs_alternative = [x.replace('#', '-') for x in docs_alternative]
# Embeddings setup (a dedicated variable, so the patient's `name` above is not clobbered)
model_name = "all-MiniLM-L12-v2"
model = SentenceTransformer(model_name)
# Embedding functions
def embed_batch(batch):
    return [model.encode(sentence, show_progress_bar=True) for sentence in batch]

def embed(sentence):
    return model.encode(sentence)
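# Note: model.encode also accepts a whole list and batches internally, which is
# usually faster than encoding one sentence at a time. An equivalent sketch:
#
#     def embed_batch(batch):
#         return model.encode(list(batch), show_progress_bar=True)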
# Create the LanceDB directory if it does not already exist
lancedb_dir = ".lancedb"
if not os.path.exists(lancedb_dir):
    os.mkdir(lancedb_dir)
uri = lancedb_dir
db = lancedb.connect(uri)
# Prepare metadata for embedding: one row per chunk, with its index and text
metadata = [[i, chunk] for i, chunk in enumerate(docs_alternative)]
doc_df = pd.DataFrame(metadata, columns=["chunk", "text"])
# Embed the documents
data = with_embeddings(embed_batch, doc_df)
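# with_embeddings runs embed_batch over the "text" column of doc_df (the default
# column name) and returns the data with an added "vector" column, ready for LanceDB.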
# LanceDB operations: overwrite any existing table on re-runs so the same
# chunks are not inserted twice (create_table already ingests `data`)
table = db.create_table("pdf_data", data=data, mode="overwrite")
message = "What are the side effects of doxycycline for treating Acne?"
results = table.search(embed(message)).limit(5).to_pandas()
#print(results.head())
message = "What are the side effects of doxycycline for treating Acne?"
results = table.search(embed(message)).limit(5).to_pandas()
#print(results.head())
# Assuming the setup for embeddings, LanceDB, and the prompt template is in place.
# drug_names are parsed from the prescriptions field; comma-separated entries are
# assumed here, so split on commas and strip the surrounding whitespace.
drug_names = [d.strip() for d in prescriptions.split(',') if d.strip()]
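# Prescriptions are often entered one per line rather than comma-separated. A more
# forgiving parser (a hypothetical helper, not part of the original flow) might be:
#
#     def parse_drug_names(raw):
#         # Split on commas, semicolons, or newlines, then drop empty entries
#         return [d.strip() for d in re.split(r'[,;\n]', raw) if d.strip()]
#
#     drug_names = parse_drug_names(prescriptions)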
def rag_answer_drug_side_effects(name, drug_names, disease):
    # Formulate a question about drug side effects
    message = (f"What are the potential side effects of using {drug_names} for treating "
               f"{disease}? Please provide a list of side effects specific to the use of "
               f"these drugs for this disease, for the patient {name}.")
    # Search the database for relevant context
    results = table.search(embed(message)).limit(10).to_pandas()
    results.sort_values(by=['_distance'], inplace=True, ascending=True)
    context = results['text'].iloc[0]  # Use the most relevant document
    # Define the prompt template
    template = """### Instruction:
Start with Hi, {name}. Then give a compassionate answer in bullet points and lists.
Read the input context below and respond with a medium-length answer to the given question. If you cannot find an exact answer, fall back to the closest information about the medication and disease.

### Input:
Context: {context}
Question: {question}

### Response:
"""
    # Augment the prompt with the retrieved context and the patient's name
    prompt = template.format(name=name, context=context, question=message)
    # Get a completion from the LLM
    result = pg.Completion.create(
        model="Neural-Chat-7B",
        prompt=prompt
    )
    return result['choices'][0]['text']
def rag_answer_drug_benefit_effects(name, drug_names, disease):
    # Formulate a question about drug benefits
    message = (f"What are the potential benefits of using {drug_names} for treating "
               f"{disease}? Please provide a list of benefits specific to the use of "
               f"these drugs for this disease, for the patient {name}.")
    # Search the database for relevant context
    results = table.search(embed(message)).limit(10).to_pandas()
    results.sort_values(by=['_distance'], inplace=True, ascending=True)
    context = results['text'].iloc[0]  # Use the most relevant document
    # Define the prompt template
    template = """### Instruction:
Start with Hi, {name}. Then give a compassionate answer in bullet points and lists.
Read the input context below and respond with a medium-length answer to the given question. If you cannot find an exact answer, fall back to the closest information about the medication and disease.

### Input:
Context: {context}
Question: {question}

### Response:
"""
    # Augment the prompt with the retrieved context and the patient's name
    prompt = template.format(name=name, context=context, question=message)
    # Get a completion from the LLM
    result = pg.Completion.create(
        model="Neural-Chat-7B",
        prompt=prompt
    )
    return result['choices'][0]['text']
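# The two RAG helpers above differ only in the aspect they ask about ("side
# effects" vs. "benefits"). A shared-helper sketch, assuming the prompt template
# string is hoisted to a module-level `template` constant (a refactoring
# suggestion, commented out so it does not change the script's behavior):
#
#     def rag_answer(name, drug_names, disease, aspect):
#         message = (f"What are the potential {aspect} of using {drug_names} for "
#                    f"treating {disease}? Please provide a list of {aspect} "
#                    f"specific to this disease, for the patient {name}.")
#         results = table.search(embed(message)).limit(10).to_pandas()
#         results.sort_values(by=['_distance'], inplace=True, ascending=True)
#         context = results['text'].iloc[0]
#         prompt = template.format(name=name, context=context, question=message)
#         result = pg.Completion.create(model="Neural-Chat-7B", prompt=prompt)
#         return result['choices'][0]['text']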
# Handle the form submission: submit_button is True only for the rerun in which
# the user clicked 'Predict Drug Effects'
if submit_button:
    # Validate input and handle errors/exceptions as necessary
    try:
        side_effects_response = rag_answer_drug_side_effects(name, drug_names, disease)
        benefits_response = rag_answer_drug_benefit_effects(name, drug_names, disease)
        st.write("Side Effects:", side_effects_response)
        st.write("Benefits:", benefits_response)
    except Exception as e:
        st.error(f"An error occurred: {e}")
# Hugging Face Hub upload leftovers from the Colab notebook. notebook_login()
# only works inside a notebook, so keep it commented out under Streamlit.
# from huggingface_hub import notebook_login
# notebook_login()
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Define the path to the checkpoint (a Hugging Face Hub repo id)
checkpoint_path = r"filius-Dei/CiPE"
# # Load the model (note: this would shadow the SentenceTransformer `model` above)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")