Commit 799674c · Bohui Zhang committed
Parent(s): 5383218

Deploy the first version

Files changed:
- README.md +21 -2
- app.py +133 -0
- ontochat/__init__.py +0 -0
- ontochat/__pycache__/__init__.cpython-311.pyc +0 -0
- ontochat/__pycache__/analysis.cpython-311.pyc +0 -0
- ontochat/__pycache__/chatbot.cpython-311.pyc +0 -0
- ontochat/__pycache__/functions.cpython-311.pyc +0 -0
- ontochat/analysis.py +274 -0
- ontochat/chatbot.py +43 -0
- ontochat/functions.py +88 -0
- requirements.txt +4 -0
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
-title:
-emoji:
+title: OntoChat
+emoji: π
 colorFrom: yellow
 colorTo: gray
 sdk: gradio
@@ -10,4 +10,23 @@ pinned: false
 license: apache-2.0
 ---
 
+# OntoChat
+
+We introduce **OntoChat**, a framework for conversational ontology engineering that supports requirement elicitation,
+analysis, and testing. By interacting with a conversational agent, users can steer the creation of use cases and the
+extraction of competency questions, while receiving computational support to analyse the overall requirements and test
+early versions of the resulting ontologies.
+
+## Deploy
+If you would like to deploy this demo locally:
+1. Create a Python environment and install the requirements using `pip install -r requirements.txt`.
+2. Run `app.py`.
+
+## TODO
+- Host on Hugging Face Spaces
+- Add ontology testing
+- Add the evaluation panel (?)
+- Optimize clustering visualization
+- Adjust flagging
+
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,133 @@
+import gradio as gr
+
+from ontochat.functions import *
+
+
+with gr.Blocks() as user_story_interface:
+    gr.Markdown(
+        """# OntoChat\nHello! I am OntoChat, your conversational ontology engineering assistant, here to help you generate
+        user stories, elicit requirements, and extract and analyze competency questions. In ontology engineering,
+        a user story contains all the requirements from the perspective of an end user of the ontology. It is a way
+        of capturing what a user needs to achieve with the ontology while also providing context and value. This demo
+        will guide you step-by-step to create a user story and generate competency questions from it. Once you are
+        ready, start inputting your persona, objective (goal), and sample data, and chat with the chatbot. Once you
+        find the generated user story satisfactory, please copy the generated user story and go to the next step
+        (tab)."""
+    )
+    with gr.Row():
+        with gr.Column():
+            api_key = gr.Textbox(
+                label="OpenAI API Key",
+                info="Please input your OpenAI API key if you don't have it set up on your own machine. Please note "
+                     "that the key will only be used for this demo and will not be uploaded or used anywhere else."
+            )
+            persona = gr.Textbox(
+                label="Persona",
+                placeholder="Please input the persona of the user, including their name, occupation, skills, and interests.",
+                info="Example: The user, Mark, is an experienced musicologist. He's an expert in western music, "
+                     "and plays piano and guitar."
+            )
+            goal = gr.Textbox(
+                label="Goal",
+                placeholder="Please input the goal of the user and any specific issues faced.",
+                info="Example: The goal of the user is to analyse analogies and symmetries between music scores, "
+                     "with a particular focus on harmony and the lyrics of the music piece."
+            )
+            sample_data = gr.Textbox(
+                label="Sample of Data",
+                placeholder="Please input a sample of data.",
+                info="Example: - 'Let it be' by 'The Beatles' has a sequence of chords "
+                     "composed by 'F, Amin, F' that is recurring every time the lyrics say 'Let it be'; - The lyrics "
+                     "of 'Running with the Devil' by 'Van Halen' have a recurring chord sequence for the chorus and a "
+                     "recurring chord sequence for the bridge."
+            )
+            generate_btn = gr.Button(value="Generate")
+        user_story_chatbot = gr.Chatbot(
+
+        )
+        chatbot_input = gr.Textbox(
+            placeholder="Please tell me what improvements I should make to the user story :)"
+        )
+    user_story = gr.TextArea(
+        label="User story",
+        interactive=True
+    )
+    generate_btn.click(
+        fn=user_story_init_generator,
+        inputs=[
+            api_key, persona, goal, sample_data
+        ],
+        outputs=[
+            user_story, user_story_chatbot
+        ]
+    )
+    chatbot_input.submit(
+        fn=user_story_generator,
+        inputs=[chatbot_input, user_story_chatbot],
+        outputs=[user_story, user_story_chatbot]
+    )
+
+cq_interface = gr.Interface(
+    fn=cq_generator,
+    inputs=[
+        gr.Textbox(
+            label="User story",
+            info="Please copy the previously generated user story and paste it here. You can also modify the user "
+                 "story before submitting it."
+        ),
+        gr.Slider(
+            minimum=5,
+            maximum=50,
+            step=1,
+            label="Number of competency questions",
+            info="Please select the number of competency questions you want to generate."
+        )
+    ],
+    outputs=[
+        gr.Textbox(label="Competency questions")
+    ],
+    title="OntoChat",
+)
+
+clustering_interface = gr.Interface(
+    fn=clustering_generator,
+    inputs=[
+        gr.Textbox(
+            label="Competency questions",
+            info="Please copy the previously generated competency questions and paste them here. You can also modify "
+                 "the questions before submitting them."
+        ),
+        gr.Dropdown(
+            choices=["Agglomerative clustering", "HDBSCAN", "LLM clustering"],
+            label="Clustering method",
+            info="Please select the clustering method."
+        ),
+        gr.Slider(
+            minimum=2,
+            maximum=50,
+            step=1,
+            label="Number of clusters",
+            info="Please select the number of clusters you want to generate. Note that for HDBSCAN, this value "
+                 "is used as the minimum size of a cluster, and the number should not exceed the total "
+                 "number of competency questions."
+        )
+    ],
+    outputs=[
+        gr.Image(label="Visualization"),
+        gr.Code(
+            language='json',
+            label="Competency Question clusters"
+        )
+    ],
+    title="OntoChat",
+)
+
+demo = gr.TabbedInterface(
+    [user_story_interface, cq_interface, clustering_interface],
+    ["User Story Generation", "Competency Question Extraction", "Competency Question Analysis"]
+)
+
+
+if __name__ == "__main__":
+    # demo.launch(share=True)
+    demo.launch()
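A note on the layout above: `gr.TabbedInterface` simply mounts each interface under its own tab. A minimal, self-contained sketch of the same pattern, where the `echo` function is a placeholder standing in for the OntoChat generator functions:

```python
import gradio as gr


def echo(text: str) -> str:
    # placeholder backend; OntoChat wires its generator functions in here
    return text


# one gr.Interface per pipeline step, mounted as tabs
story_tab = gr.Interface(fn=echo, inputs=gr.Textbox(label="Inputs"), outputs=gr.Textbox(label="User story"))
cq_tab = gr.Interface(fn=echo, inputs=gr.Textbox(label="User story"), outputs=gr.Textbox(label="Competency questions"))

demo = gr.TabbedInterface([story_tab, cq_tab], ["User Story Generation", "Competency Question Extraction"])

if __name__ == "__main__":
    demo.launch()  # launch(share=True) would expose a temporary public URL instead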
ontochat/__init__.py
ADDED
File without changes
ontochat/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (155 Bytes)
ontochat/__pycache__/analysis.cpython-311.pyc
ADDED
Binary file (12.2 kB)
ontochat/__pycache__/chatbot.cpython-311.pyc
ADDED
Binary file (2.12 kB)
ontochat/__pycache__/functions.cpython-311.pyc
ADDED
Binary file (3.8 kB)
ontochat/analysis.py
ADDED
@@ -0,0 +1,274 @@
+"""
+Competency questions analysis functions
+Partially inherited from [idea](https://github.com/polifonia-project/idea)
+"""
+
+import ast
+import io
+import re
+from collections import defaultdict
+
+import numpy as np
+
+from PIL import Image
+from matplotlib import pyplot as plt
+
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import AgglomerativeClustering, HDBSCAN
+from scipy.cluster.hierarchy import dendrogram
+
+from ontochat.chatbot import chat_completion
+
+
+def preprocess_competency_questions(cqs):
+    # preprocess competency questions: string -> list of strings
+    cqs = cqs.split("\n")
+    # # keep index
+    # cqs = [re.split(r'\.\s', cq, 1) for cq in cqs]
+    # cqs = [{cq[0]: cq[1]} for cq in cqs]
+    cqs = [re.split(r'\.\s', cq, 1)[1] for cq in cqs]
+
+    # clean
+    cleaned_cqs = []
+    for q in cqs:  # FIXME to move
+        # Collapse complex questions in a sentence
+        q = q.replace("\n", "; ")
+        # Remove tabular occurrences for metadata
+        q = q.replace("\t", " ")
+        # Collapse multiple empty spaces
+        q = re.sub(r"[ ]+", " ", q)
+        # Discard inconsistent punctuation
+        q = re.sub(r";[ ]*;", ";", q)
+        cleaned_cqs.append(q)
+
+    return cleaned_cqs
+
+
+def compute_embeddings(cqs, model="all-MiniLM-L6-v2", device="cpu"):
+    """
+    Compute sentence-level embeddings of competency questions
+
+    :param cqs: competency questions as a numbered, newline-separated string
+    :param model: SentenceTransformer model name
+    :param device: device on which to run the encoder
+    :return: the cleaned questions and their unit-norm embeddings
+    """
+    cleaned_cqs = preprocess_competency_questions(cqs)
+
+    model = SentenceTransformer(model, device=device)
+    embeddings = model.encode(cleaned_cqs)
+
+    # Normalisation of CQ embeddings to unit length
+    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+
+    return cleaned_cqs, embeddings
+
+
+def agglomerative_clustering(cqs, embeddings, n_clusters=None, metric="euclidean", distance_threshold=None):
+    """
+    Group competency questions with agglomerative clustering
+
+    :param cqs: list of cleaned competency questions
+    :param embeddings: embeddings of the competency questions
+    :param n_clusters: number of clusters to find
+    :param metric: distance metric used to compute linkage
+    :param distance_threshold: linkage distance above which clusters are not merged
+    :return: a cluster-id -> questions mapping and a dendrogram as a PIL image
+    """
+    clustering_model = AgglomerativeClustering(
+        n_clusters=n_clusters,
+        metric=metric,
+        distance_threshold=distance_threshold,
+        compute_distances=True
+    )
+    clustering_model.fit(embeddings)
+    cluster_assignment = clustering_model.labels_
+
+    clustered_cqs = defaultdict(list)
+    for sentence_id, cluster_id in enumerate(cluster_assignment):
+        clustered_cqs[str(cluster_id)].append(cqs[sentence_id])
+
+    pil_image = plot_dendrogram(
+        clustering_model,
+        orientation='right',
+        labels=list(range(1, len(cqs) + 1)),
+        # labels=cqs,
+        truncate_mode=None,
+        # p=3,
+        show_leaf_counts=False,
+    )
+
+    return clustered_cqs, pil_image
+
+
+def plot_dendrogram(model, **kwargs):
+    """ Create linkage matrix and then plot the dendrogram
+    source: https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html
+
+    :param model: a fitted AgglomerativeClustering model
+    :param kwargs: keyword arguments forwarded to scipy's dendrogram
+    :return: the dendrogram as a PIL image
+    """
+    # create the counts of samples under each node
+    counts = np.zeros(model.children_.shape[0])
+    n_samples = len(model.labels_)
+    for i, merge in enumerate(model.children_):
+        current_count = 0
+        for child_idx in merge:
+            if child_idx < n_samples:
+                current_count += 1  # leaf node
+            else:
+                current_count += counts[child_idx - n_samples]
+        counts[i] = current_count
+
+    linkage_matrix = np.column_stack(
+        [model.children_, model.distances_, counts]
+    ).astype(float)
+
+    # Plot the corresponding dendrogram
+    plt.tight_layout()
+    # plt.figure(figsize=(40, 20))
+    dendrogram(linkage_matrix, **kwargs)
+    # plt.subplots_adjust(left=0.25, right=1.025, top=0.9, bottom=0.075)
+    # plt.savefig(figsave_path)
+    # plt.show()
+    # convert the figure into a PIL image
+    fig = plt.gcf()
+    buf = io.BytesIO()
+    fig.savefig(buf)
+    buf.seek(0)
+    return Image.open(buf)
+
+
+def hdbscan_clustering(cqs, embeddings, min_cluster_size=2):
+    """
+    Group competency questions with HDBSCAN
+
+    :param cqs: list of cleaned competency questions
+    :param embeddings: embeddings of the competency questions
+    :param min_cluster_size: minimum number of questions per cluster
+    :return: a cluster-id -> questions mapping and a scatter plot as a PIL image
+    """
+    clusterer = HDBSCAN(
+        min_cluster_size=min_cluster_size
+    )
+    clusterer.fit(embeddings)
+    cluster_assignment = clusterer.labels_
+
+    clustered_cqs = defaultdict(list)
+    for sentence_id, cluster_id in enumerate(cluster_assignment):
+        clustered_cqs[str(cluster_id)].append(cqs[sentence_id])
+
+    fig, axis = plt.subplots(1, 1)
+    image = plot_hdbscan_scatter(embeddings, cluster_assignment, parameters={"scale": 3, "eps": 0.9}, ax=axis)
+    return clustered_cqs, image
+
+
+def plot_hdbscan_scatter(data, labels, probabilities=None, parameters=None, ground_truth=False, ax=None):
+    """
+    source: https://scikit-learn.org/stable/auto_examples/cluster/plot_hdbscan.html
+
+    :param data: embedding matrix, plotted on its first two dimensions
+    :param labels: cluster labels (-1 marks noise)
+    :param probabilities: per-point cluster membership probabilities
+    :param parameters: clustering parameters to display in the title
+    :param ground_truth: whether the labels are ground truth or estimated
+    :param ax: matplotlib axis to plot on
+    :return: the scatter plot as a PIL image
+    """
+    if ax is None:
+        _, ax = plt.subplots(figsize=(10, 4))
+    labels = labels if labels is not None else np.ones(data.shape[0])
+    probabilities = probabilities if probabilities is not None else np.ones(data.shape[0])
+    # Black removed and is used for noise instead.
+    unique_labels = set(labels)
+    colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
+    # The probability of a point belonging to its labeled cluster determines
+    # the size of its marker
+    proba_map = {idx: probabilities[idx] for idx in range(len(labels))}
+    for k, col in zip(unique_labels, colors):
+        if k == -1:
+            # Black used for noise.
+            col = [0, 0, 0, 1]
+
+        class_index = np.where(labels == k)[0]
+        for ci in class_index:
+            ax.plot(
+                data[ci, 0],
+                data[ci, 1],
+                "x" if k == -1 else "o",
+                markerfacecolor=tuple(col),
+                markeredgecolor="k",
+                markersize=4 if k == -1 else 1 + 5 * proba_map[ci],
+            )
+    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
+    preamble = "True" if ground_truth else "Estimated"
+    title = f"{preamble} number of clusters: {n_clusters_}"
+    if parameters is not None:
+        parameters_str = ", ".join(f"{k}={v}" for k, v in parameters.items())
+        title += f" | {parameters_str}"
+    ax.set_title(title)
+    plt.tight_layout()
+    fig = plt.gcf()
+    buf = io.BytesIO()
+    fig.savefig(buf)
+    buf.seek(0)
+    return Image.open(buf)
+
+
+def response_parser(response):
+    try:
+        response = ast.literal_eval(response)
+    except (ValueError, TypeError, SyntaxError):
+        response = ""
+    return response
+
+
+def llm_cq_clustering(cqs: str, n_clusters: int, paraphrase_detection=False):
+    """
+    Cluster competency questions by prompting an LLM
+
+    :param cqs: competency questions as a string
+    :param n_clusters: number of clusters to request
+    :param paraphrase_detection: whether to remove paraphrased duplicates first
+    :return: the parsed clusters and a blank placeholder image
+    """
+    conversation_history = [
+        {"role": "system", "content": "You are an ontology engineer."}
+    ]
+    # paraphrase detection before clustering
+    if paraphrase_detection:
+        # 1. paraphrase detection
+        prompt_1 = "Perform paraphrase detection for the following competency questions: {}. " \
+                   "Return a Python list of duplicate competency questions.".format(cqs)
+
+        conversation_history.append({"role": "user", "content": prompt_1})
+        response = chat_completion(conversation_history)
+        print("{} CQs remaining after paraphrase detection.".format(len(cqs) - len(response_parser(response))))
+
+        # 2. clustering
+        prompt_2 = f"Cluster the competency questions into {n_clusters} clusters based on their topics. " \
+                   "Keep the granularity of the topic in each cluster at a similar level. " \
+                   "Return in JSON format, such as: {'cluster 1 topic': " \
+                   "['competency question 1', 'competency question 2']}:"
+        conversation_history.append({"role": "assistant", "content": response})  # previous response
+        conversation_history.append({"role": "user", "content": prompt_2})
+        response = chat_completion(conversation_history)
+        # print("Output is: \"{}\"".format(response))
+
+    else:  # clustering only
+        prompt_2 = f"Given the competency questions: {cqs}, cluster them into {n_clusters} clusters based on the topics. "
+        prompt_2 += "Keep the granularity of the topic in each cluster at a similar level. " \
+                    "Return in JSON format, such as: {'cluster 1 topic': " \
+                    "['competency question 1', 'competency question 2']}:"
+        conversation_history.append({"role": "user", "content": prompt_2})
+        response = chat_completion(conversation_history)
+        # print("Output is: \"{}\"".format(response))
+
+    # # 3. assign labels
+    # prompt_2 = "Clustering the competency questions based on their topics. Return in JSON format, " \
+    #            "such as: {'cluster 1 topic': ['competency question 1', 'competency question 2']}:"
+    # conversation_history.append({"role": "assistant", "content": response})  # previous response
+    # conversation_history.append({"role": "user", "content": prompt_2})
+    # response = chat_completion(conversation_history)
+    # response = response.choices[0].message.content
+    # print("Output is: \"{}\"".format(response))
+
+    return response_parser(response), Image.new("RGB", (640, 480), (255, 255, 255))
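To make the pipeline above concrete, here is a minimal usage sketch of the embedding-plus-clustering path. The four competency questions are made up; the input format (a numbered, newline-separated string) is what `preprocess_competency_questions` expects:

```python
from ontochat.analysis import compute_embeddings, agglomerative_clustering

# illustrative CQs in the numbered format the preprocessor splits on ("N. question")
cq_text = (
    "1. What is the chord sequence of a piece?\n"
    "2. Which lyrics co-occur with a recurring chord sequence?\n"
    "3. Who is the composer of a piece?\n"
    "4. Which artist performs a piece?"
)

cqs, embeddings = compute_embeddings(cq_text)  # cleaned questions + unit-norm embeddings
clusters, dendrogram_image = agglomerative_clustering(cqs, embeddings, n_clusters=2)
print(dict(clusters))                 # e.g. {"0": [...], "1": [...]}
dendrogram_image.save("cq_dendrogram.png")
```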
ontochat/chatbot.py
ADDED
@@ -0,0 +1,43 @@
+from openai import OpenAI
+
+
+client = OpenAI()
+MODEL_NAME = "gpt-3.5-turbo"
+TEMPERATURE = 0
+SEED = 1234
+
+
+def chat_completion(messages):
+    completion = client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        seed=SEED,
+        temperature=TEMPERATURE,
+    )
+    return completion.choices[0].message.content
+
+
+def build_history(messages):
+    """
+    convert OpenAI client messages to gradio.Chatbot history
+    :param messages: a list of {"role": ..., "content": ...} dicts
+    :return: a list of [user_message, bot_message] pairs
+    """
+    message_list = [None,]
+    for item in messages:
+        message_list.append(item["content"])
+    history = [[message_list[i], message_list[i+1]] for i in range(0, len(message_list), 2)]
+    return history
+
+
+def build_messages(history):
+    """
+    convert gradio.Chatbot history to OpenAI client messages
+    :param history: a list of [user_message, bot_message] pairs
+    :return: a list of {"role": ..., "content": ...} dicts
+    """
+    messages = list()
+    for item in history:
+        messages.append({"role": "user", "content": item[0]})
+        messages.append({"role": "system", "content": item[1]})
+    return messages[1:]
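A minimal usage sketch for the helper above. Note that `client = OpenAI()` runs at import time, so `OPENAI_API_KEY` must be set in the environment before `ontochat.chatbot` is imported (the key below is a placeholder):

```python
import os

os.environ.setdefault("OPENAI_API_KEY", "sk-...")  # placeholder; use a real key

from ontochat.chatbot import chat_completion, build_history

messages = [
    {"role": "system", "content": "You are an ontology engineer."},
    {"role": "user", "content": "Suggest one competency question about song lyrics."},
]
reply = chat_completion(messages)
print(reply)

# store the reply the same way functions.py does, then convert for gr.Chatbot;
# build_history pairs messages two by two, so it expects the reply to be present
messages.append({"role": "system", "content": reply})
print(build_history(messages))  # [[None, system_prompt], [user_prompt, reply]]
```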
ontochat/functions.py
ADDED
@@ -0,0 +1,88 @@
+"""
+Interface functions
+"""
+
+import json
+import os
+
+from ontochat.chatbot import chat_completion, build_history, build_messages
+from ontochat.analysis import compute_embeddings, agglomerative_clustering, hdbscan_clustering, llm_cq_clustering
+
+
+def user_story_init_generator(api_key, persona, goal, sample_data):
+    if os.environ.get("OPENAI_API_KEY") is None:
+        # openai.api_key = api_key
+        os.environ["OPENAI_API_KEY"] = api_key
+    messages = [{
+        "role": "system",
+        "content": "I am a conversational ontology engineering assistant, to help the user generate user stories, "
+                   "elicit requirements, and extract and analyze competency questions. In ontology engineering, "
+                   "a user story contains all the requirements from the perspective of an end user of the ontology. "
+                   "It is a way of capturing what a user needs to achieve with the ontology while also providing "
+                   "context and value. I will guide the user step-by-step to create a user story and generate "
+                   "competency questions from it."
+    }, {
+        "role": "user",
+        "content": f"The persona of the user is {persona}. The goal of the user is {goal}. A sample of data is "
+                   f"{sample_data}. Write a user story for the ontology that fits the information provided."
+    }]
+    bot_message = chat_completion(messages)
+    messages.append({
+        "role": "system",
+        "content": bot_message
+    })
+    history = build_history(messages)
+    return bot_message, history
+
+
+def user_story_generator(message, history):
+    """
+    Refine the user story based on the user's chat feedback
+
+    :param message: the user's latest chatbot input
+    :param history: the gradio.Chatbot history so far
+    :return: the updated user story and chat history
+    """
+    messages = build_messages(history) + [{"role": "user", "content": message}]  # include the latest feedback
+    bot_message = chat_completion(messages)
+    history.append((message, bot_message))
+    return bot_message, history
+
+
+def cq_generator(messages, numbers):
+    """
+    Generate competency questions from a user story
+
+    :param messages: the user story
+    :param numbers: the number of competency questions to generate
+    :return: the generated competency questions
+    """
+    messages = [
+        {
+            "role": "system",
+            "content": "You are an ontology engineer."
+        }, {
+            "role": "user",
+            "content": f"Please generate {numbers} competency questions based on the user story: {messages}"
+        }  # TODO: format constraint
+    ]
+    response = chat_completion(messages)
+    return response
+
+
+def clustering_generator(cqs, cluster_method, n_clusters):
+    """
+    Cluster competency questions with the selected method
+
+    :param cqs: competency questions as a numbered, newline-separated string
+    :param cluster_method: one of "Agglomerative clustering", "HDBSCAN", "LLM clustering"
+    :param n_clusters: number of clusters (minimum cluster size for HDBSCAN)
+    :return: the cluster visualization and the clusters as a JSON string
+    """
+    cqs, cq_embeddings = compute_embeddings(cqs)
+
+    if cluster_method == "Agglomerative clustering":
+        cq_clusters, cluster_image = agglomerative_clustering(cqs, cq_embeddings, n_clusters)
+    elif cluster_method == "HDBSCAN":
+        cq_clusters, cluster_image = hdbscan_clustering(cqs, cq_embeddings, n_clusters)
+    else:  # cluster_method == "LLM clustering"
+        cq_clusters, cluster_image = llm_cq_clustering(cqs, n_clusters)
+
+    return cluster_image, json.dumps(cq_clusters, indent=4)
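An end-to-end sketch of the interface layer, under the same `OPENAI_API_KEY` caveat as above. The user story is made up, and note the `# TODO: format constraint` in `cq_generator`: the clustering step assumes the model returns the questions as a numbered list, which the prompt does not yet enforce:

```python
from ontochat.functions import cq_generator, clustering_generator

# generate CQs from an illustrative user story, then cluster them
story = "Mark, a musicologist, wants to analyse recurring chord sequences and their lyrics."
cq_text = cq_generator(story, 10)  # expected: numbered competency questions as a string

image, clusters_json = clustering_generator(cq_text, "Agglomerative clustering", 3)
print(clusters_json)               # {"0": [...], "1": [...], "2": [...]}
image.save("clusters.png")
```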
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+openai
+gradio
+scikit-learn
+sentence-transformers