Spaces:
Runtime error
Runtime error
| from typing import Any, Dict, List | |
| from langchain_core.messages import AnyMessage, AIMessage, HumanMessage | |
def get_research_topic(messages: List[AnyMessage]) -> str:
    """
    Get the research topic from the messages.

    A single-message history is returned verbatim; a longer history is
    flattened into a "User:" / "Assistant:" transcript, one line per
    message, skipping any message that is neither human nor AI.
    """
    # Single-turn request: the sole message IS the research topic.
    if len(messages) == 1:
        return messages[-1].content

    # Multi-turn request: render each recognized message as a labeled line.
    transcript_lines = []
    for msg in messages:
        if isinstance(msg, HumanMessage):
            transcript_lines.append(f"User: {msg.content}\n")
        elif isinstance(msg, AIMessage):
            transcript_lines.append(f"Assistant: {msg.content}\n")
    return "".join(transcript_lines)
def resolve_urls(urls_to_resolve: List[Any], id: int) -> Dict[str, str]:
    """
    Create a map of the vertex ai search urls (very long) to a short url
    with a unique id for each url.

    Ensures each original URL gets a consistent shortened form while
    maintaining uniqueness: a duplicate URL keeps the short URL minted at
    its first occurrence.

    Args:
        urls_to_resolve: Objects exposing a ``.web.uri`` attribute
            (Vertex AI grounding chunks).
        id: Run identifier embedded in every shortened URL.
            NOTE: shadows the ``id`` builtin; the name is kept so existing
            keyword callers are unaffected.

    Returns:
        Dict mapping each unique original URL to
        ``https://vertexaisearch.cloud.google.com/id/<id>-<first_index>``.
    """
    # Plain string, not an f-string: there are no placeholders to fill here.
    prefix = "https://vertexaisearch.cloud.google.com/id/"
    resolved_map: Dict[str, str] = {}
    # Enumerate directly over the sites; the index of a URL's *first*
    # occurrence is what ends up in its short form.
    for idx, site in enumerate(urls_to_resolve):
        url = site.web.uri
        if url not in resolved_map:
            resolved_map[url] = f"{prefix}{id}-{idx}"
    return resolved_map
def insert_citation_markers(text, citations_list):
    """
    Inserts citation markers into a text string based on start and end indices.

    Args:
        text (str): The original text string.
        citations_list (list): A list of dictionaries, where each dictionary
            contains 'start_index', 'end_index', and 'segments' — a list of
            dicts whose 'label' and 'short_url' entries are rendered as
            markdown links. Indices are assumed to be for the original text.

    Returns:
        str: The text with citation markers inserted.
    """
    # Sort citations by end_index in descending order (ties broken by
    # start_index descending). Inserting from the back of the string means
    # each insertion leaves the original-text indices of the remaining,
    # earlier citations valid.
    sorted_citations = sorted(
        citations_list, key=lambda c: (c["end_index"], c["start_index"]), reverse=True
    )

    modified_text = text
    for citation_info in sorted_citations:
        end_idx = citation_info["end_index"]
        # One " [label](short_url)" marker per supporting segment.
        marker_to_insert = "".join(
            f" [{segment['label']}]({segment['short_url']})"
            for segment in citation_info["segments"]
        )
        # Insert the citation marker at the original end_idx position.
        modified_text = (
            modified_text[:end_idx] + marker_to_insert + modified_text[end_idx:]
        )

    return modified_text
def get_citations(response, resolved_urls_map):
    """
    Extracts and formats citation information from a Gemini model's response.

    This function processes the grounding metadata provided in the response to
    construct a list of citation objects. Each citation object includes the
    start and end indices of the text segment it refers to, and a list of
    segment dicts describing the supporting web chunks.

    Args:
        response: The response object from the Gemini model, expected to have
            a structure including `candidates[0].grounding_metadata`.
        resolved_urls_map: Mapping from grounding-chunk URIs to shortened
            URLs (e.g. the dict produced by `resolve_urls`).

    Returns:
        list: A list of dictionaries, where each dictionary represents a
        citation and has the following keys:
            - "start_index" (int): The starting character index of the cited
              segment in the original text. Defaults to 0 if not specified.
            - "end_index" (int): Taken verbatim from
              `support.segment.end_index`; supports with a missing end_index
              are skipped entirely.
            - "segments" (list[dict]): One dict per grounding chunk, with
              "label", "short_url" (None when the URI is not in
              `resolved_urls_map`), and "value" (the original chunk URI).
        Returns an empty list if no valid candidates or grounding supports
        are found, or if essential data is missing.
    """
    citations = []

    # Ensure response and necessary nested structures are present
    if not response or not response.candidates:
        return citations

    candidate = response.candidates[0]
    if (
        not hasattr(candidate, "grounding_metadata")
        or not candidate.grounding_metadata
        or not hasattr(candidate.grounding_metadata, "grounding_supports")
    ):
        return citations

    for support in candidate.grounding_metadata.grounding_supports:
        citation = {}

        # Ensure segment information is present
        if not hasattr(support, "segment") or support.segment is None:
            continue  # Skip this support if segment info is missing

        start_index = (
            support.segment.start_index
            if support.segment.start_index is not None
            else 0
        )

        # Ensure end_index is present to form a valid segment
        if support.segment.end_index is None:
            continue  # Skip if end_index is missing, as it's crucial

        # NOTE(review): end_index is used exactly as the API provided it —
        # no +1 adjustment happens here. Whether the API value is inclusive
        # or exclusive cannot be determined from this code; confirm against
        # the Gemini grounding-metadata docs before changing it.
        citation["start_index"] = start_index
        citation["end_index"] = support.segment.end_index

        citation["segments"] = []
        if (
            hasattr(support, "grounding_chunk_indices")
            and support.grounding_chunk_indices
        ):
            for ind in support.grounding_chunk_indices:
                try:
                    chunk = candidate.grounding_metadata.grounding_chunks[ind]
                    resolved_url = resolved_urls_map.get(chunk.web.uri, None)
                    citation["segments"].append(
                        {
                            # Drops the last dot-separated piece of the title
                            # and keeps the first remaining one (e.g.
                            # "example.com" -> "example"). A title with no
                            # "." makes the [0] raise IndexError, so that
                            # segment is silently skipped by the handler.
                            "label": chunk.web.title.split(".")[:-1][0],
                            "short_url": resolved_url,
                            "value": chunk.web.uri,
                        }
                    )
                except (IndexError, AttributeError, NameError):
                    # Best-effort: skip this segment if the chunk index is
                    # out of range or the chunk/web attributes are missing.
                    # In a production system, you might want to log this.
                    pass
        citations.append(citation)
    return citations