Commit 799674c · Bohui Zhang committed
Parent(s): 5383218

Deploy the first version

Files changed:
- README.md +21 -2
- app.py +133 -0
- ontochat/__init__.py +0 -0
- ontochat/__pycache__/__init__.cpython-311.pyc +0 -0
- ontochat/__pycache__/analysis.cpython-311.pyc +0 -0
- ontochat/__pycache__/chatbot.cpython-311.pyc +0 -0
- ontochat/__pycache__/functions.cpython-311.pyc +0 -0
- ontochat/analysis.py +274 -0
- ontochat/chatbot.py +43 -0
- ontochat/functions.py +88 -0
- requirements.txt +4 -0
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
-title:
-emoji:
+title: OntoChat
+emoji: π
 colorFrom: yellow
 colorTo: gray
 sdk: gradio
@@ -10,4 +10,23 @@ pinned: false
 license: apache-2.0
 ---
 
+# OntoChat
+
+We introduce **OntoChat**, a framework for conversational ontology engineering that supports requirement elicitation,
+analysis, and testing. By interacting with a conversational agent, users can steer the creation of use cases and the
+extraction of competency questions, while receiving computational support to analyse the overall requirements and test
+early versions of the resulting ontologies.
+
+## Deploy
+If you would like to deploy this demo locally:
+1. Create a Python environment and install the requirements using `pip install -r requirements.txt`.
+2. Run `app.py`.
+
+## TODO
+- Host on Hugging Face Spaces
+- Add ontology testing
+- Add the evaluation panel (?)
+- Optimize clustering visualization
+- Adjust flagging
+
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,133 @@
+import gradio as gr
+
+from ontochat.functions import *
+
+
+with gr.Blocks() as user_story_interface:
+    gr.Markdown(
+        """# OntoChat\nHello! I am OntoChat, your conversational ontology engineering assistant, here to help you generate
+        user stories, elicit requirements, and extract and analyze competency questions. In ontology engineering,
+        a user story contains all the requirements from the perspective of an end user of the ontology. It is a way
+        of capturing what a user needs to achieve with the ontology while also providing context and value. This demo
+        will guide you step-by-step to create a user story and generate competency questions from it. Once you are
+        ready, start inputting your persona, objective (goal), and sample data, and chat with the chatbot. Once you
+        find the generated user story satisfactory, please copy the generated user story and go to the next step
+        (tab)."""
+    )
+    with gr.Row():
+        with gr.Column():
+            api_key = gr.Textbox(
+                label="OpenAI API Key",
+                info="Please input your OpenAI API key if you don't have it set up on your own machine. Please note "
+                     "that the key will only be used for this demo and will not be uploaded or used anywhere else."
+            )
+            persona = gr.Textbox(
+                label="Persona",
+                placeholder="Please input the persona of the user, including their name, occupation, skills, and interests.",
+                info="Example: The user, Mark, is an experienced musicologist. He's an expert in western music, "
+                     "and plays piano and guitar."
+            )
+            goal = gr.Textbox(
+                label="Goal",
+                placeholder="Please input the goal of the user and any specific issues faced.",
+                info="Example: The goal of the user is to analyse analogies and symmetries between music scores, "
+                     "with a particular focus on harmony and the lyrics of the music piece."
+            )
+            sample_data = gr.Textbox(
+                label="Sample of Data",
+                placeholder="Please input a sample of data.",
+                info="Example: - 'Let it be' by 'The Beatles' has a sequence of chords "
+                     "composed by 'F, Amin, F' that is recurring every time the lyrics say 'Let it be'; - The lyrics "
+                     "of 'Running with the Devil' by 'Van Halen' have a recurring chord sequence for the chorus and a "
+                     "recurring chord sequence for the bridge."
+            )
+            generate_btn = gr.Button(value="Generate")
+        user_story_chatbot = gr.Chatbot(
+
+        )
+        chatbot_input = gr.Textbox(
+            placeholder="Please tell me what improvements I should make to the user story :)"
+        )
+    user_story = gr.TextArea(
+        label="User story",
+        interactive=True
+    )
+    generate_btn.click(
+        fn=user_story_init_generator,
+        inputs=[
+            api_key, persona, goal, sample_data
+        ],
+        outputs=[
+            user_story, user_story_chatbot
+        ]
+    )
+    chatbot_input.submit(
+        fn=user_story_generator,
+        inputs=[chatbot_input, user_story_chatbot],
+        outputs=[user_story, user_story_chatbot]
+    )
+
+cq_interface = gr.Interface(
+    fn=cq_generator,
+    inputs=[
+        gr.Textbox(
+            label="User story",
+            info="Please copy the previously generated user story and paste it here. You can also modify the user "
+                 "story before submitting it."
+        ),
+        gr.Slider(
+            minimum=5,
+            maximum=50,
+            step=1,
+            label="Number of competency questions",
+            info="Please select the number of competency questions you want to generate."
+        )
+    ],
+    outputs=[
+        gr.Textbox(label="Competency questions")
+    ],
+    title="OntoChat",
+)
+
+clustering_interface = gr.Interface(
+    fn=clustering_generator,
+    inputs=[
+        gr.Textbox(
+            label="Competency questions",
+            info="Please copy the previously generated competency questions and paste them here. You can also modify "
+                 "the questions before submitting them."
+        ),
+        gr.Dropdown(
+            choices=["Agglomerative clustering", "HDBSCAN", "LLM clustering"],
+            label="Clustering method",
+            info="Please select the clustering method."
+        ),
+        gr.Slider(
+            minimum=2,
+            maximum=50,
+            step=1,
+            label="Number of clusters",
+            info="Please select the number of clusters you want to generate. Note that for HDBSCAN, this value "
+                 "is used as the minimum size of a cluster, and the number should not exceed the total "
+                 "number of competency questions."
+        )
+    ],
+    outputs=[
+        gr.Image(label="Visualization"),
+        gr.Code(
+            language='json',
+            label="Competency Question clusters"
+        )
+    ],
+    title="OntoChat",
+)
+
+demo = gr.TabbedInterface(
+    [user_story_interface, cq_interface, clustering_interface],
+    ["User Story Generation", "Competency Question Extraction", "Competency Question Analysis"]
+)
+
+
+if __name__ == "__main__":
+    # demo.launch(share=True)
+    demo.launch()
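A note on the layout above: `gr.TabbedInterface` simply mounts each interface under its own tab. A minimal, self-contained sketch of the same pattern, where the `echo` function is a placeholder standing in for the OntoChat generator functions:

```python
import gradio as gr


def echo(text: str) -> str:
    # placeholder backend; OntoChat wires its generator functions in here
    return text


# one gr.Interface per pipeline step, mounted as tabs
story_tab = gr.Interface(fn=echo, inputs=gr.Textbox(label="Inputs"), outputs=gr.Textbox(label="User story"))
cq_tab = gr.Interface(fn=echo, inputs=gr.Textbox(label="User story"), outputs=gr.Textbox(label="Competency questions"))

demo = gr.TabbedInterface([story_tab, cq_tab], ["User Story Generation", "Competency Question Extraction"])

if __name__ == "__main__":
    demo.launch()  # launch(share=True) would expose a temporary public URL instead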
ontochat/__init__.py
ADDED
File without changes
ontochat/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (155 Bytes)
ontochat/__pycache__/analysis.cpython-311.pyc
ADDED
Binary file (12.2 kB)
ontochat/__pycache__/chatbot.cpython-311.pyc
ADDED
Binary file (2.12 kB)
ontochat/__pycache__/functions.cpython-311.pyc
ADDED
Binary file (3.8 kB)
ontochat/analysis.py
ADDED
@@ -0,0 +1,274 @@
+"""
+Competency questions analysis functions
+Partially inherited from [idea](https://github.com/polifonia-project/idea)
+"""
+
+import ast
+import io
+import re
+from collections import defaultdict
+
+import numpy as np
+
+from PIL import Image
+from matplotlib import pyplot as plt
+
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import AgglomerativeClustering, HDBSCAN
+from scipy.cluster.hierarchy import dendrogram
+
+from ontochat.chatbot import chat_completion
+
+
+def preprocess_competency_questions(cqs):
+    # preprocess competency questions: string -> list of strings
+    cqs = cqs.split("\n")
+    # # keep index
+    # cqs = [re.split(r'\.\s', cq, 1) for cq in cqs]
+    # cqs = [{cq[0]: cq[1]} for cq in cqs]
+    cqs = [re.split(r'\.\s', cq, 1)[1] for cq in cqs]
+
+    # clean
+    cleaned_cqs = []
+    for q in cqs:  # FIXME to move
+        # Collapse complex questions in a sentence
+        q = q.replace("\n", "; ")
+        # Remove tabular occurrences for metadata
+        q = q.replace("\t", " ")
+        # Collapse multiple empty spaces
+        q = re.sub(r"[ ]+", " ", q)
+        # Discard inconsistent punctuation
+        q = re.sub(r";[ ]*;", ";", q)
+        cleaned_cqs.append(q)
+
+    return cleaned_cqs
+
+
+def compute_embeddings(cqs, model="all-MiniLM-L6-v2", device="cpu"):
+    """
+    Compute sentence-level embeddings of competency questions
+
+    :param cqs: competency questions as a numbered, newline-separated string
+    :param model: SentenceTransformer model name
+    :param device: device on which to run the encoder
+    :return: the cleaned questions and their unit-norm embeddings
+    """
+    cleaned_cqs = preprocess_competency_questions(cqs)
+
+    model = SentenceTransformer(model, device=device)
+    embeddings = model.encode(cleaned_cqs)
+
+    # Normalisation of CQ embeddings to unit length
+    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+
+    return cleaned_cqs, embeddings
+
+
+def agglomerative_clustering(cqs, embeddings, n_clusters=None, metric="euclidean", distance_threshold=None):
+    """
+    Group competency questions with agglomerative clustering
+
+    :param cqs: list of cleaned competency questions
+    :param embeddings: embeddings of the competency questions
+    :param n_clusters: number of clusters to find
+    :param metric: distance metric used to compute linkage
+    :param distance_threshold: linkage distance above which clusters are not merged
+    :return: a cluster-id -> questions mapping and a dendrogram as a PIL image
+    """
+    clustering_model = AgglomerativeClustering(
+        n_clusters=n_clusters,
+        metric=metric,
+        distance_threshold=distance_threshold,
+        compute_distances=True
+    )
+    clustering_model.fit(embeddings)
+    cluster_assignment = clustering_model.labels_
+
+    clustered_cqs = defaultdict(list)
+    for sentence_id, cluster_id in enumerate(cluster_assignment):
+        clustered_cqs[str(cluster_id)].append(cqs[sentence_id])
+
+    pil_image = plot_dendrogram(
+        clustering_model,
+        orientation='right',
+        labels=list(range(1, len(cqs) + 1)),
+        # labels=cqs,
+        truncate_mode=None,
+        # p=3,
+        show_leaf_counts=False,
+    )
+
+    return clustered_cqs, pil_image
+
+
+def plot_dendrogram(model, **kwargs):
+    """ Create linkage matrix and then plot the dendrogram
+    source: https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html
+
+    :param model: a fitted AgglomerativeClustering model
+    :param kwargs: keyword arguments forwarded to scipy's dendrogram
+    :return: the dendrogram as a PIL image
+    """
+    # create the counts of samples under each node
+    counts = np.zeros(model.children_.shape[0])
+    n_samples = len(model.labels_)
+    for i, merge in enumerate(model.children_):
+        current_count = 0
+        for child_idx in merge:
+            if child_idx < n_samples:
+                current_count += 1  # leaf node
+            else:
+                current_count += counts[child_idx - n_samples]
+        counts[i] = current_count
+
+    linkage_matrix = np.column_stack(
+        [model.children_, model.distances_, counts]
+    ).astype(float)
+
+    # Plot the corresponding dendrogram
+    plt.tight_layout()
+    # plt.figure(figsize=(40, 20))
+    dendrogram(linkage_matrix, **kwargs)
+    # plt.subplots_adjust(left=0.25, right=1.025, top=0.9, bottom=0.075)
+    # plt.savefig(figsave_path)
+    # plt.show()
+    # convert the figure into a PIL image
+    fig = plt.gcf()
+    buf = io.BytesIO()
+    fig.savefig(buf)
+    buf.seek(0)
+    return Image.open(buf)
+
+
+def hdbscan_clustering(cqs, embeddings, min_cluster_size=2):
+    """
+    Group competency questions with HDBSCAN
+
+    :param cqs: list of cleaned competency questions
+    :param embeddings: embeddings of the competency questions
+    :param min_cluster_size: minimum number of questions per cluster
+    :return: a cluster-id -> questions mapping and a scatter plot as a PIL image
+    """
+    clusterer = HDBSCAN(
+        min_cluster_size=min_cluster_size
+    )
+    clusterer.fit(embeddings)
+    cluster_assignment = clusterer.labels_
+
+    clustered_cqs = defaultdict(list)
+    for sentence_id, cluster_id in enumerate(cluster_assignment):
+        clustered_cqs[str(cluster_id)].append(cqs[sentence_id])
+
+    fig, axis = plt.subplots(1, 1)
+    image = plot_hdbscan_scatter(embeddings, cluster_assignment, parameters={"scale": 3, "eps": 0.9}, ax=axis)
+    return clustered_cqs, image
+
+
+def plot_hdbscan_scatter(data, labels, probabilities=None, parameters=None, ground_truth=False, ax=None):
+    """
+    source: https://scikit-learn.org/stable/auto_examples/cluster/plot_hdbscan.html
+
+    :param data: embedding matrix, plotted on its first two dimensions
+    :param labels: cluster labels (-1 marks noise)
+    :param probabilities: per-point cluster membership probabilities
+    :param parameters: clustering parameters to display in the title
+    :param ground_truth: whether the labels are ground truth or estimated
+    :param ax: matplotlib axis to plot on
+    :return: the scatter plot as a PIL image
+    """
+    if ax is None:
+        _, ax = plt.subplots(figsize=(10, 4))
+    labels = labels if labels is not None else np.ones(data.shape[0])
+    probabilities = probabilities if probabilities is not None else np.ones(data.shape[0])
+    # Black removed and is used for noise instead.
+    unique_labels = set(labels)
+    colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
+    # The probability of a point belonging to its labeled cluster determines
+    # the size of its marker
+    proba_map = {idx: probabilities[idx] for idx in range(len(labels))}
+    for k, col in zip(unique_labels, colors):
+        if k == -1:
+            # Black used for noise.
+            col = [0, 0, 0, 1]
+
+        class_index = np.where(labels == k)[0]
+        for ci in class_index:
+            ax.plot(
+                data[ci, 0],
+                data[ci, 1],
+                "x" if k == -1 else "o",
+                markerfacecolor=tuple(col),
+                markeredgecolor="k",
+                markersize=4 if k == -1 else 1 + 5 * proba_map[ci],
+            )
+    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
+    preamble = "True" if ground_truth else "Estimated"
+    title = f"{preamble} number of clusters: {n_clusters_}"
+    if parameters is not None:
+        parameters_str = ", ".join(f"{k}={v}" for k, v in parameters.items())
+        title += f" | {parameters_str}"
+    ax.set_title(title)
+    plt.tight_layout()
+    fig = plt.gcf()
+    buf = io.BytesIO()
+    fig.savefig(buf)
+    buf.seek(0)
+    return Image.open(buf)
+
+
+def response_parser(response):
+    try:
+        response = ast.literal_eval(response)
+    except (ValueError, TypeError, SyntaxError):
+        response = ""
+    return response
+
+
+def llm_cq_clustering(cqs: str, n_clusters: int, paraphrase_detection=False):
+    """
+    Cluster competency questions by prompting an LLM
+
+    :param cqs: competency questions as a string
+    :param n_clusters: number of clusters to request
+    :param paraphrase_detection: whether to remove paraphrased duplicates first
+    :return: the parsed clusters and a blank placeholder image
+    """
+    conversation_history = [
+        {"role": "system", "content": "You are an ontology engineer."}
+    ]
+    # paraphrase detection before clustering
+    if paraphrase_detection:
+        # 1. paraphrase detection
+        prompt_1 = "Perform paraphrase detection for the following competency questions: {}. " \
+                   "Return a Python list of duplicate competency questions.".format(cqs)
+
+        conversation_history.append({"role": "user", "content": prompt_1})
+        response = chat_completion(conversation_history)
+        print("{} CQs remaining after paraphrase detection.".format(len(cqs) - len(response_parser(response))))
+
+        # 2. clustering
+        prompt_2 = f"Cluster the competency questions into {n_clusters} clusters based on their topics. " \
+                   "Keep the granularity of the topic in each cluster at a similar level. " \
+                   "Return in JSON format, such as: {'cluster 1 topic': " \
+                   "['competency question 1', 'competency question 2']}:"
+        conversation_history.append({"role": "assistant", "content": response})  # previous response
+        conversation_history.append({"role": "user", "content": prompt_2})
+        response = chat_completion(conversation_history)
+        # print("Output is: \"{}\"".format(response))
+
+    else:  # clustering only
+        prompt_2 = f"Given the competency questions: {cqs}, cluster them into {n_clusters} clusters based on the topics. "
+        prompt_2 += "Keep the granularity of the topic in each cluster at a similar level. " \
+                    "Return in JSON format, such as: {'cluster 1 topic': " \
+                    "['competency question 1', 'competency question 2']}:"
+        conversation_history.append({"role": "user", "content": prompt_2})
+        response = chat_completion(conversation_history)
+        # print("Output is: \"{}\"".format(response))
+
+    # # 3. assign labels
+    # prompt_2 = "Clustering the competency questions based on their topics. Return in JSON format, " \
+    #            "such as: {'cluster 1 topic': ['competency question 1', 'competency question 2']}:"
+    # conversation_history.append({"role": "assistant", "content": response})  # previous response
+    # conversation_history.append({"role": "user", "content": prompt_2})
+    # response = chat_completion(conversation_history)
+    # response = response.choices[0].message.content
+    # print("Output is: \"{}\"".format(response))
+
+    return response_parser(response), Image.new("RGB", (640, 480), (255, 255, 255))
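To make the pipeline above concrete, here is a minimal usage sketch of the embedding-plus-clustering path. The four competency questions are made up; the input format (a numbered, newline-separated string) is what `preprocess_competency_questions` expects:

```python
from ontochat.analysis import compute_embeddings, agglomerative_clustering

# illustrative CQs in the numbered format the preprocessor splits on ("N. question")
cq_text = (
    "1. What is the chord sequence of a piece?\n"
    "2. Which lyrics co-occur with a recurring chord sequence?\n"
    "3. Who is the composer of a piece?\n"
    "4. Which artist performs a piece?"
)

cqs, embeddings = compute_embeddings(cq_text)  # cleaned questions + unit-norm embeddings
clusters, dendrogram_image = agglomerative_clustering(cqs, embeddings, n_clusters=2)
print(dict(clusters))                 # e.g. {"0": [...], "1": [...]}
dendrogram_image.save("cq_dendrogram.png")
```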
ontochat/chatbot.py
ADDED
@@ -0,0 +1,43 @@
+from openai import OpenAI
+
+
+client = OpenAI()
+MODEL_NAME = "gpt-3.5-turbo"
+TEMPERATURE = 0
+SEED = 1234
+
+
+def chat_completion(messages):
+    completion = client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        seed=SEED,
+        temperature=TEMPERATURE,
+    )
+    return completion.choices[0].message.content
+
+
+def build_history(messages):
+    """
+    convert OpenAI client messages to gradio.Chatbot history
+    :param messages: a list of {"role": ..., "content": ...} dicts
+    :return: a list of [user_message, bot_message] pairs
+    """
+    message_list = [None,]
+    for item in messages:
+        message_list.append(item["content"])
+    history = [[message_list[i], message_list[i+1]] for i in range(0, len(message_list), 2)]
+    return history
+
+
+def build_messages(history):
+    """
+    convert gradio.Chatbot history to OpenAI client messages
+    :param history: a list of [user_message, bot_message] pairs
+    :return: a list of {"role": ..., "content": ...} dicts
+    """
+    messages = list()
+    for item in history:
+        messages.append({"role": "user", "content": item[0]})
+        messages.append({"role": "system", "content": item[1]})
+    return messages[1:]
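A minimal usage sketch for the helper above. Note that `client = OpenAI()` runs at import time, so `OPENAI_API_KEY` must be set in the environment before `ontochat.chatbot` is imported (the key below is a placeholder):

```python
import os

os.environ.setdefault("OPENAI_API_KEY", "sk-...")  # placeholder; use a real key

from ontochat.chatbot import chat_completion, build_history

messages = [
    {"role": "system", "content": "You are an ontology engineer."},
    {"role": "user", "content": "Suggest one competency question about song lyrics."},
]
reply = chat_completion(messages)
print(reply)

# store the reply the same way functions.py does, then convert for gr.Chatbot;
# build_history pairs messages two by two, so it expects the reply to be present
messages.append({"role": "system", "content": reply})
print(build_history(messages))  # [[None, system_prompt], [user_prompt, reply]]
```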
ontochat/functions.py
ADDED
@@ -0,0 +1,88 @@
+"""
+Interface functions
+"""
+
+import json
+import os
+
+from ontochat.chatbot import chat_completion, build_history, build_messages
+from ontochat.analysis import compute_embeddings, agglomerative_clustering, hdbscan_clustering, llm_cq_clustering
+
+
+def user_story_init_generator(api_key, persona, goal, sample_data):
+    if os.environ.get("OPENAI_API_KEY") is None:
+        # openai.api_key = api_key
+        os.environ["OPENAI_API_KEY"] = api_key
+    messages = [{
+        "role": "system",
+        "content": "I am a conversational ontology engineering assistant, to help the user generate user stories, "
+                   "elicit requirements, and extract and analyze competency questions. In ontology engineering, "
+                   "a user story contains all the requirements from the perspective of an end user of the ontology. "
+                   "It is a way of capturing what a user needs to achieve with the ontology while also providing "
+                   "context and value. I will guide the user step-by-step to create a user story and generate "
+                   "competency questions from it."
+    }, {
+        "role": "user",
+        "content": f"The persona of the user is {persona}. The goal of the user is {goal}. A sample of data is "
+                   f"{sample_data}. Write a user story for the ontology that fits the information provided."
+    }]
+    bot_message = chat_completion(messages)
+    messages.append({
+        "role": "system",
+        "content": bot_message
+    })
+    history = build_history(messages)
+    return bot_message, history
+
+
+def user_story_generator(message, history):
+    """
+    Refine the user story based on the user's chat feedback
+
+    :param message: the user's latest chatbot input
+    :param history: the gradio.Chatbot history so far
+    :return: the updated user story and chat history
+    """
+    messages = build_messages(history) + [{"role": "user", "content": message}]  # include the latest feedback
+    bot_message = chat_completion(messages)
+    history.append((message, bot_message))
+    return bot_message, history
+
+
+def cq_generator(messages, numbers):
+    """
+    Generate competency questions from a user story
+
+    :param messages: the user story
+    :param numbers: the number of competency questions to generate
+    :return: the generated competency questions
+    """
+    messages = [
+        {
+            "role": "system",
+            "content": "You are an ontology engineer."
+        }, {
+            "role": "user",
+            "content": f"Please generate {numbers} competency questions based on the user story: {messages}"
+        }  # TODO: format constraint
+    ]
+    response = chat_completion(messages)
+    return response
+
+
+def clustering_generator(cqs, cluster_method, n_clusters):
+    """
+    Cluster competency questions with the selected method
+
+    :param cqs: competency questions as a numbered, newline-separated string
+    :param cluster_method: one of "Agglomerative clustering", "HDBSCAN", "LLM clustering"
+    :param n_clusters: number of clusters (minimum cluster size for HDBSCAN)
+    :return: the cluster visualization and the clusters as a JSON string
+    """
+    cqs, cq_embeddings = compute_embeddings(cqs)
+
+    if cluster_method == "Agglomerative clustering":
+        cq_clusters, cluster_image = agglomerative_clustering(cqs, cq_embeddings, n_clusters)
+    elif cluster_method == "HDBSCAN":
+        cq_clusters, cluster_image = hdbscan_clustering(cqs, cq_embeddings, n_clusters)
+    else:  # cluster_method == "LLM clustering"
+        cq_clusters, cluster_image = llm_cq_clustering(cqs, n_clusters)
+
+    return cluster_image, json.dumps(cq_clusters, indent=4)
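An end-to-end sketch of the interface layer, under the same `OPENAI_API_KEY` caveat as above. The user story is made up, and note the `# TODO: format constraint` in `cq_generator`: the clustering step assumes the model returns the questions as a numbered list, which the prompt does not yet enforce:

```python
from ontochat.functions import cq_generator, clustering_generator

# generate CQs from an illustrative user story, then cluster them
story = "Mark, a musicologist, wants to analyse recurring chord sequences and their lyrics."
cq_text = cq_generator(story, 10)  # expected: numbered competency questions as a string

image, clusters_json = clustering_generator(cq_text, "Agglomerative clustering", 3)
print(clusters_json)               # {"0": [...], "1": [...], "2": [...]}
image.save("clusters.png")
```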
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+openai
+gradio
+scikit-learn
+sentence-transformers