ciftselcuk commited on
Commit
f0b69ef
·
0 Parent(s):

Initial deployment: GLiNER Large entity extractor with 70 labels

Browse files
Files changed (3) hide show
  1. README.md +62 -0
  2. app.py +170 -0
  3. requirements.txt +8 -0
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: T9 Oracle Entity Extractor
3
+ emoji: 🔬
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ hardware: t4-small
12
+ ---
13
+
14
+ # T9 Oracle Entity Extractor
15
+
16
+ **Zero-shot NER for Medical Device Technical Documentation**
17
+
18
+ This Space provides entity extraction using GLiNER Large (1.7GB) for technical documentation in the medical device domain.
19
+
20
+ ## Features
21
+
22
+ - **72 Entity Labels** across 10 tiers
23
+ - **Zero-shot learning** - no training required
24
+ - **Medical device focus** - optimized for endoscope equipment, parts, specifications
25
+ - **Hardware:** NVIDIA T4 GPU for fast inference
26
+
27
+ ## Entity Types
28
+
29
+ ### Tier 1: Critical Identifiers
30
+ - part_number, component_name, manufacturer, model_number
31
+
32
+ ### Tier 2: Specifications
33
+ - pressure, temperature, voltage, current, material, dimension, weight, volume, flow_rate, power, diameter, length, thickness
34
+
35
+ ### Tier 3: Standards & Compliance
36
+ - standard_reference (ISO, ASTM, EN, IEC, ANSI), certification, compliance, safety_class
37
+
38
+ ### Tier 4-10: Additional Labels
39
+ - Thread standards, geometry, documentation, operational parameters, manufacturing IDs, medical device specific, visual elements, quality & maintenance
40
+
41
+ ## API Usage
42
+
43
+ ```python
44
+ from gradio_client import Client
45
+
46
+ client = Client("YOUR_USERNAME/t9-oracle-gliner-entity-extractor")
47
+
48
+ text = "Part Number: A70002-2, Material: SS316L, Pressure: 60 psi"
49
+ result = client.predict(text, api_name="/extract")
50
+ print(result)
51
+ ```
52
+
53
+ ## Configuration
54
+
55
+ - **Model:** urchade/gliner_large-v2.1
56
+ - **GPU:** NVIDIA T4 (16GB VRAM)
57
+ - **Cost:** $0.60/hour (Persistent)
58
+ - **Max input:** 10,000 characters per request (longer input is truncated)
59
+
60
+ ## Project
61
+
62
+ Part of the T9 Oracle Knowledge Base Extraction System for Auto Sink medical device documentation.
app.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ T9 Oracle GLiNER Entity Extractor - HF Space Deployment
3
+ Gradio API endpoint for zero-shot NER with 72 medical device labels
4
+
5
+ Deployed on: Persistent T4 GPU
6
+ Model: urchade/gliner_large-v2.1 (1.7GB)
7
+ Cost: $0.60/hour
8
+ """
9
+
10
+ import gradio as gr
11
+ import json
12
+ import logging
13
+ from typing import List, Dict
14
+ from gliner import GLiNER
15
+
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
# Approved entity labels (from T9 configuration), grouped by tier.
# NOTE: the per-tier counts below add up to 72 labels, not the 70 quoted in
# earlier documentation. Runtime code always uses len(ENTITY_LABELS), so this
# list is the source of truth for what the model is asked to find.
ENTITY_LABELS = [
    # Tier 1: Critical Identifiers (4)
    "part_number", "component_name", "manufacturer", "model_number",
    # Tier 2: Specifications & Measurements (13)
    "pressure", "temperature", "voltage", "current", "material",
    "dimension", "weight", "volume", "flow_rate", "power",
    "diameter", "length", "thickness",
    # Tier 3: Standards & Compliance (4)
    "standard_reference", "certification", "compliance", "safety_class",
    # Tier 4: Geometry & Mechanical (11)
    "thread_standard", "pipe_size", "tubing_size", "connector_type",
    "surface_finish", "surface_treatment", "width", "height",
    "tolerance", "hardness", "torque",
    # Tier 5: Documentation (7)
    "diagram_reference", "drawing_number", "procedure_number",
    "test_protocol", "revision", "sku_number", "part_label",
    # Tier 6: Operational Parameters (9)
    "accuracy", "speed", "frequency", "resistance",
    "operating_temperature", "supply_voltage", "response_time",
    "duty_cycle", "operating_range",
    # Tier 7: Manufacturing (8)
    "operator_id", "tool_number", "gauge_id", "fixture_number",
    "machine_id", "lot_number", "serial_number", "batch_id",
    # Tier 8: Medical Device (7)
    "medical_device", "scope_manufacturer", "channel_type",
    "port_type", "hub_type", "color_code", "leak_test",
    # Tier 9: Visual Elements (2)
    "diagram_type", "technical_annotation",
    # Tier 10: Quality & Maintenance (7)
    "calibration_interval", "service_interval", "mtbf",
    "warranty", "expiration_date", "production_date", "inspection_report",
]
52
+
53
# Load the GLiNER model once at import time (i.e. on Space startup).
# The model is placed on the GPU explicitly when one is available, because
# loading alone does not guarantee that (NOTE(review): from_pretrained appears
# to default to CPU placement - confirm against the pinned gliner version).
import torch  # imported here because torch is not among the module's top-level imports

logger.info("Loading GLiNER Large model (1.7GB)...")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = GLiNER.from_pretrained("urchade/gliner_large-v2.1").to(device)
logger.info(f"✓ GLiNER loaded with {len(ENTITY_LABELS)} labels")
logger.info("GLiNER running on device: %s", device)
57
+
58
+
59
def extract_entities(text: str, max_length: int = 10000) -> str:
    """Extract entities from ``text`` using GLiNER zero-shot NER.

    Args:
        text: Input text. Empty or whitespace-only input is rejected.
        max_length: Maximum number of characters passed to the model; longer
            input is cut to this length before prediction.

    Returns:
        A JSON string. On success it contains ``entities`` (objects with
        ``text``, ``label``, ``start``, ``end`` and ``score``),
        ``input_length`` (length after any truncation), ``entity_count``,
        ``labels_used`` and ``truncated``. For empty input or when prediction
        fails it contains an empty ``entities`` list and an ``error`` message.
        The function does not raise for prediction errors.
    """
    if not text or not text.strip():
        return json.dumps({"entities": [], "error": "Empty text provided"})

    # Cut over-long input, and remember that we did so the caller can be told.
    # NOTE(review): GLiNER models presumably have their own maximum sequence
    # length, so very long input may be cut further inside the model - confirm
    # against the gliner documentation.
    original_length = len(text)
    truncated = original_length > max_length
    if truncated:
        logger.warning(
            "Text truncated from %d to %d chars", original_length, max_length
        )
        text = text[:max_length]

    try:
        predictions = model.predict_entities(text, ENTITY_LABELS)

        # Normalise each prediction to a fixed, JSON-serialisable shape.
        entities = [
            {
                "text": pred.get("text", ""),
                "label": pred.get("label", ""),
                "start": pred.get("start", 0),
                "end": pred.get("end", 0),
                "score": float(pred.get("score", 0.0)),
            }
            for pred in predictions
        ]

        logger.info(
            "Extracted %d entities from %d chars", len(entities), len(text)
        )

        return json.dumps(
            {
                "entities": entities,
                "input_length": len(text),
                "entity_count": len(entities),
                "labels_used": len(ENTITY_LABELS),
                "truncated": truncated,
            },
            indent=2,
        )

    except Exception as e:
        # Deliberately broad: this is the API boundary, so failures are
        # reported in the JSON payload instead of crashing the request.
        # logger.exception keeps the traceback in the Space logs.
        logger.exception("Extraction failed: %s", e)
        return json.dumps({"entities": [], "error": str(e)})
105
+
106
+
107
def batch_extract(text_batch: str) -> str:
    """Extract entities from several texts supplied as a single string.

    Args:
        text_batch: Multiple texts separated by blank lines (double
            newlines). Windows (``\\r\\n``) and bare ``\\r`` line endings are
            normalised first so they separate texts the same way. ``None`` or
            an empty string is treated as an empty batch.

    Returns:
        A JSON string with ``results`` (one ``extract_entities`` result per
        text, each tagged with its ``text_index``) and ``batch_size``.
    """
    # Normalise line endings so "\r\n\r\n" is recognised as a separator.
    normalized = (text_batch or "").replace("\r\n", "\n").replace("\r", "\n")
    texts = [chunk.strip() for chunk in normalized.split("\n\n") if chunk.strip()]

    results = []
    for index, text in enumerate(texts):
        # extract_entities returns a JSON string; decode it so the per-text
        # results nest as objects rather than as escaped strings.
        result = json.loads(extract_entities(text))
        result["text_index"] = index
        results.append(result)

    return json.dumps({"results": results, "batch_size": len(texts)}, indent=2)
127
+
128
+
129
# Gradio front end: a single-textbox interface whose function is
# extract_entities and whose output is rendered as JSON.
_DEMO_DESCRIPTION = f"""
**Zero-shot NER for Medical Device Technical Documentation**

Extracts **{len(ENTITY_LABELS)} entity types** across 10 tiers:
- Part numbers, dimensions, materials, standards
- Electrical specs, pressure, temperature, flow rates
- Thread standards, tolerances, surface treatments
- Medical device specific (scopes, channels, colors)
- Quality & maintenance data

**Model:** GLiNER Large v2.1 (1.7GB)
**Hardware:** NVIDIA T4 GPU (16GB VRAM)
**Max input:** 10,000 characters per request
"""

# One sample input per row; each row has a single value for the one textbox.
_DEMO_EXAMPLES = [
    ["Part Number: A70002-2, Material: SS316L, Pressure: 60 psi, Thread: 1/4\" NPT"],
    ["Standard: ISO 1179-2, ASTM A112, Temperature: -40 to 85°C, Dimension: 6mm x 35mm"],
    ["Manufacturer: Olympus, Channel: Biopsy, Color: Orange Tubing, Serial: SN-123456"],
]

_input_box = gr.Textbox(
    lines=10,
    placeholder="Enter technical text here (max 10,000 chars)...",
    label="Input Text",
)

demo = gr.Interface(
    fn=extract_entities,
    inputs=[_input_box],
    outputs=gr.JSON(label="Extracted Entities"),
    title="T9 Oracle Entity Extractor (GLiNER Large)",
    description=_DEMO_DESCRIPTION,
    examples=_DEMO_EXAMPLES,
    api_name="extract",  # endpoint name used by API clients
    allow_flagging="never",
)
163
+
164
# Script entry point: start the Gradio server only when run directly.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,  # HF Spaces handles sharing
    }
    demo.launch(**launch_options)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # T9 Oracle GLiNER Entity Extractor - HF Space Dependencies
2
+ # Approved configuration from /tmp/entity_extraction_final_configuration.md
3
+
4
+ gliner==0.2.8
5
+ torch>=2.0.0
6
+ transformers>=4.30.0
7
+ gradio>=4.0.0
8
+ Pillow>=10.0.0