K1Z3M1112 committed
Commit c8108cb · verified · 1 Parent(s): b731a45

Update app.py

Files changed (1): app.py (+72 -52)
app.py CHANGED
@@ -20,9 +20,9 @@ if torch.cuda.is_available():
 
 # Device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-dtype = torch.float32  # use float32 to avoid compatibility issues
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
-print(f"🖥️ Device: {device} | dtype: {dtype}")
+print(f"🖥️ Device: {device} | dtype: {torch_dtype}")
 
 # Lazy import (to avoid long startup if unused)
 from diffusers import (
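The new dtype policy picks fp16 on CUDA and falls back to fp32 on CPU, and every downstream `from_pretrained` call in this commit keys its `variant` flag off the same choice. A minimal sketch of that pattern (the helper name is illustrative, not part of the app):

```python
import torch

def pick_device_and_dtype():
    """Illustrative helper mirroring the commit's policy: fp16 on CUDA
    for speed and VRAM, fp32 on CPU, where half precision is poorly
    supported."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    # Checkpoints that ship fp16 weights are selected with variant="fp16";
    # this flag must track the dtype choice, otherwise from_pretrained
    # downloads full-precision weights and casts them at load time.
    variant = "fp16" if torch_dtype == torch.float16 else None
    return device, torch_dtype, variant
```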
@@ -234,24 +234,25 @@ def load_florence2():
 
         print("📥 Loading Microsoft/Florence-2-base...")
 
-        # Load the processor
-        FLORENCE2_PROCESSOR = AutoProcessor.from_pretrained(
-            "microsoft/Florence-2-base",
-            trust_remote_code=True
-        )
-
-        # Use the older loading style to avoid compatibility issues
+        # Load the model as in the official docs
         FLORENCE2_MODEL = AutoModelForCausalLM.from_pretrained(
-            "microsoft/Florence-2-base",
-            torch_dtype=dtype,
+            "microsoft/Florence-2-base",
+            torch_dtype=torch_dtype,
             trust_remote_code=True
         ).to(device)
 
+        FLORENCE2_PROCESSOR = AutoProcessor.from_pretrained(
+            "microsoft/Florence-2-base",
+            trust_remote_code=True
+        )
+
         print("✅ Florence-2 model loaded successfully")
         return FLORENCE2_PROCESSOR, FLORENCE2_MODEL
 
     except Exception as e:
         print(f"❌ Error loading Florence-2: {e}")
+        import traceback
+        traceback.print_exc()
         return None, None
 
 def analyze_with_florence2(image, task_prompt):
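With the reordered loading (model first, then processor, both with `trust_remote_code=True`), a failed load now prints a full traceback and returns `(None, None)`. A hypothetical smoke test for the loader, assuming `load_florence2` is imported from this module:

```python
# Hypothetical smoke test; load_florence2() is the function this hunk
# modifies, and it returns (None, None) on failure.
processor, model = load_florence2()
if processor is None or model is None:
    raise SystemExit("Florence-2 failed to load; see the traceback above.")
print(f"Loaded {type(model).__name__} with {type(processor).__name__}")
```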
@@ -271,8 +272,6 @@ def analyze_with_florence2(image, task_prompt):
     try:
         if isinstance(image, np.ndarray):
             image = Image.fromarray(image)
-        elif hasattr(image, 'shape'):  # could be a torch tensor
-            image = Image.fromarray(image.cpu().numpy())
         else:
             return "❌ Invalid image format. Please upload a valid image."
     except Exception as e:
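The torch-tensor branch is dropped, so the function now converts only NumPy arrays, which is what Gradio image components typically deliver. A standalone sketch of that normalization (the `to_pil` helper is illustrative and, unlike the app, also passes through PIL images):

```python
import numpy as np
from PIL import Image

def to_pil(image):
    """Illustrative input normalization: accept a NumPy array
    (e.g. from a Gradio Image component) or an existing PIL image,
    and reject anything else."""
    if isinstance(image, np.ndarray):
        return Image.fromarray(image)
    if isinstance(image, Image.Image):
        return image
    raise TypeError("Invalid image format")
```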
@@ -289,49 +288,61 @@ def analyze_with_florence2(image, task_prompt):
             new_size = (int(image.width * ratio), int(image.height * ratio))
             image = image.resize(new_size, Image.Resampling.LANCZOS)
 
-        # Prepare input
+        # Prepare inputs as in the official docs
         try:
             inputs = processor(
                 text=task_prompt,
                 images=image,
                 return_tensors="pt"
-            ).to(device)
+            ).to(device, torch_dtype)
         except Exception as e:
             print(f"❌ Error processing image: {e}")
             return f"❌ Error processing image: {str(e)}"
 
-        # Check that the inputs are valid
-        if inputs is None or 'pixel_values' not in inputs:
-            return "❌ Failed to process image for analysis."
-
-        # Generate
+        # Generate as in the official docs
         try:
             generated_ids = model.generate(
                 input_ids=inputs["input_ids"],
                 pixel_values=inputs["pixel_values"],
-                max_new_tokens=512,  # fewer tokens for faster processing
-                num_beams=2,  # fewer beams for faster processing
-                early_stopping=True
+                max_new_tokens=1024,
+                do_sample=False,
+                num_beams=3,
             )
         except Exception as e:
             print(f"❌ Error generating text: {e}")
             return f"❌ Error during analysis: {str(e)}"
 
-        # Decode
+        # Decode
         try:
             generated_text = processor.batch_decode(
                 generated_ids,
-                skip_special_tokens=True
+                skip_special_tokens=False
             )[0]
         except Exception as e:
             print(f"❌ Error decoding text: {e}")
             return f"❌ Error decoding result: {str(e)}"
 
-        # Clean up
-        if device.type == "cuda":
-            torch.cuda.empty_cache()
-
-        return generated_text
+        # Parse the result with post_process_generation
+        try:
+            parsed_answer = processor.post_process_generation(
+                generated_text,
+                task=task_prompt,
+                image_size=(image.width, image.height)
+            )
+
+            # Convert the result into a readable string
+            if isinstance(parsed_answer, dict):
+                result_str = ""
+                for key, value in parsed_answer.items():
+                    result_str += f"{key}:\n{value}\n\n"
+                return result_str.strip()
+            else:
+                return str(parsed_answer)
+
+        except Exception as e:
+            print(f"❌ Error in post-processing: {e}")
+            # If post-processing fails, return the raw generated text
+            return f"Raw output: {generated_text}"
 
     except Exception as e:
         print(f"❌ Error in Florence-2 analysis: {e}")
@@ -404,17 +415,17 @@ def get_pipeline(model_name: str, controlnet_type: str = "lineart", lora_model:
             controlnet_model_name = get_controlnet_model(controlnet_type)
             controlnet = ControlNetModel.from_pretrained(
                 controlnet_model_name,
-                torch_dtype=dtype
+                torch_dtype=torch_dtype
             ).to(device)
 
             pipe = StableDiffusionXLPipeline.from_pretrained(
                 model_name,
                 controlnet=controlnet,
-                torch_dtype=dtype,
+                torch_dtype=torch_dtype,
                 safety_checker=None,
                 requires_safety_checker=False,
                 use_safetensors=True,
-                variant="fp16" if dtype == torch.float16 else None
+                variant="fp16" if torch_dtype == torch.float16 else None
             ).to(device)
         else:
             raise ValueError(f"SDXL model {model_name} only supports limited ControlNet types: {list(SDXL_CONTROLNET_MODELS.keys())}")
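Note that `StableDiffusionXLPipeline` does not consume a `controlnet` argument; in diffusers, ControlNet-conditioned SDXL generation goes through `StableDiffusionXLControlNetPipeline`. A hedged loading sketch with the same dtype/variant handling (the ControlNet checkpoint ID is illustrative; the app resolves its own via `get_controlnet_model()`):

```python
import torch
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-canny-sdxl-1.0",  # illustrative SDXL ControlNet
    torch_dtype=torch_dtype,
)
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    torch_dtype=torch_dtype,
    use_safetensors=True,
    variant="fp16" if torch_dtype == torch.float16 else None,
).to(device)
```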
@@ -423,17 +434,17 @@ def get_pipeline(model_name: str, controlnet_type: str = "lineart", lora_model:
         controlnet_model_name = get_controlnet_model(controlnet_type)
         controlnet = ControlNetModel.from_pretrained(
             controlnet_model_name,
-            torch_dtype=dtype
+            torch_dtype=torch_dtype
         ).to(device)
 
         pipe = StableDiffusionControlNetPipeline.from_pretrained(
             model_name,
             controlnet=controlnet,
-            torch_dtype=dtype,
+            torch_dtype=torch_dtype,
             safety_checker=None,
             requires_safety_checker=False,
             use_safetensors=True,
-            variant="fp16" if dtype == torch.float16 else None
+            variant="fp16" if torch_dtype == torch.float16 else None
         ).to(device)
 
     # Apply LoRA if specified
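Once built, the SD1.5 ControlNet pipeline takes the conditioning map as `image`. A hypothetical invocation, assuming `pipe` from the hunk above and placeholder file paths:

```python
from PIL import Image

control_image = Image.open("lineart_map.png").convert("RGB")  # placeholder
result = pipe(
    prompt="a cozy cabin in the woods, best quality",
    image=control_image,                # the conditioning map
    num_inference_steps=30,
    controlnet_conditioning_scale=1.0,  # how strongly the map constrains
).images[0]
result.save("output.png")
```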
@@ -540,20 +551,20 @@ def load_t2i_model(model_name: str, lora_model: str = None, lora_weight: float =
             # Load base and refiner
             CURRENT_T2I_PIPE = StableDiffusionXLPipeline.from_pretrained(
                 "stabilityai/stable-diffusion-xl-base-1.0",
-                torch_dtype=dtype,
+                torch_dtype=torch_dtype,
                 safety_checker=None,
                 requires_safety_checker=False,
                 use_safetensors=True,
-                variant="fp16" if dtype == torch.float16 else None
+                variant="fp16" if torch_dtype == torch.float16 else None
             ).to(device)
 
             CURRENT_SDXL_REFINER = StableDiffusionXLPipeline.from_pretrained(
                 model_name,
-                torch_dtype=dtype,
+                torch_dtype=torch_dtype,
                 safety_checker=None,
                 requires_safety_checker=False,
                 use_safetensors=True,
-                variant="fp16" if dtype == torch.float16 else None,
+                variant="fp16" if torch_dtype == torch.float16 else None,
                 text_encoder_2=CURRENT_T2I_PIPE.text_encoder_2,
                 vae=CURRENT_T2I_PIPE.vae
             ).to(device)
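Passing `text_encoder_2` and `vae` from the base pipeline into the refiner avoids loading those weights twice. For reference, the canonical diffusers base-plus-refiner pattern loads the refiner as an img2img pipeline; a sketch under that assumption:

```python
import torch
from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline

torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
variant = "fp16" if torch_dtype == torch.float16 else None

base = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch_dtype, use_safetensors=True, variant=variant,
)
refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=base.text_encoder_2,  # reuse instead of loading twice
    vae=base.vae,
    torch_dtype=torch_dtype, use_safetensors=True, variant=variant,
)
# The base stage emits latents and the refiner polishes them; the
# denoising_end / denoising_start arguments split the schedule between them.
```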
@@ -561,22 +572,22 @@ def load_t2i_model(model_name: str, lora_model: str = None, lora_weight: float =
         else:
             CURRENT_T2I_PIPE = StableDiffusionXLPipeline.from_pretrained(
                 model_name,
-                torch_dtype=dtype,
+                torch_dtype=torch_dtype,
                 safety_checker=None,
                 requires_safety_checker=False,
                 use_safetensors=True,
-                variant="fp16" if dtype == torch.float16 else None
+                variant="fp16" if torch_dtype == torch.float16 else None
             ).to(device)
         print(f"✅ Loaded SDXL model: {model_name}")
     else:
         # Load SD1.5 model
         CURRENT_T2I_PIPE = StableDiffusionPipeline.from_pretrained(
             model_name,
-            torch_dtype=dtype,
+            torch_dtype=torch_dtype,
             safety_checker=None,
             requires_safety_checker=False,
             use_safetensors=True,
-            variant="fp16" if dtype == torch.float16 else None
+            variant="fp16" if torch_dtype == torch.float16 else None
         ).to(device)
         print(f"✅ Loaded SD1.5 model: {model_name}")
 
@@ -631,14 +642,14 @@ def load_t2i_model(model_name: str, lora_model: str = None, lora_weight: float =
         if is_sdxl_model(model_name):
             CURRENT_T2I_PIPE = StableDiffusionXLPipeline.from_pretrained(
                 model_name,
-                torch_dtype=dtype,
+                torch_dtype=torch_dtype,
                 safety_checker=None,
                 requires_safety_checker=False
             ).to(device)
         else:
             CURRENT_T2I_PIPE = StableDiffusionPipeline.from_pretrained(
                 model_name,
-                torch_dtype=dtype,
+                torch_dtype=torch_dtype,
                 safety_checker=None,
                 requires_safety_checker=False
             ).to(device)
@@ -1128,15 +1139,24 @@ with gr.Blocks(title="🎨 Advanced Image Generation Suite", theme=gr.themes.Sof
         gr.Markdown("""
         ### Microsoft Florence-2 Vision Language Model
         **Pre-trained Tasks:**
-        - `<OCR>`: Text recognition
-        - `<CAPTION>`: Image captioning
-        - `<DETAILED_CAPTION>`: Detailed caption
-        - `<MORE_DETAILED_CAPTION>`: More detailed caption
-        - `<OD>`: Object detection
+        - `<OCR>`: Text recognition (Extract text from image)
+        - `<CAPTION>`: Image captioning (Generate a caption)
+        - `<DETAILED_CAPTION>`: Detailed caption (More detailed description)
+        - `<MORE_DETAILED_CAPTION>`: More detailed caption (Even more details)
+        - `<OD>`: Object detection (Detect objects with bounding boxes)
         - `<OPEN_VOCABULARY_DETECTION>`: Open-vocabulary detection
         - `<REGION_PROPOSAL>`: Region proposal
 
-        **Note:** Upload an image and select a task to analyze it.
+        **How to use:**
+        1. Upload an image
+        2. Select a task from the dropdown
+        3. Click "Analyze Image"
+        4. Results will be displayed in the text box
+
+        **Example tasks:**
+        - Extract text from a document: `<OCR>`
+        - Describe what's in the image: `<CAPTION>`
+        - Detect objects in the image: `<OD>`
         """)
 
         with gr.Row():
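The documented task prompts map directly onto `analyze_with_florence2`. Hypothetical calls, assuming a placeholder image path and the function defined earlier in this file:

```python
from PIL import Image

img = Image.open("receipt.jpg").convert("RGB")   # placeholder path
print(analyze_with_florence2(img, "<OCR>"))      # extract text
print(analyze_with_florence2(img, "<CAPTION>"))  # short description
print(analyze_with_florence2(img, "<OD>"))       # objects + bounding boxes
```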
 