Spaces:

Svngoku
/

mistral-ocr-demo

Running

App Files Community

Svngoku committed 25 days ago

Commit

f2a7627

verified ·

1 Parent(s): 23df37a

Update app.py

Browse files

Files changed (1) hide show

app.py +162 -147

app.py CHANGED Viewed

@@ -6,19 +6,19 @@ import shutil
 import time
 import pymupdf as fitz
 import logging
-import mimetypes
-from mistralai import Mistral
 from mistralai.models import OCRResponse
 from typing import Union, List, Tuple, Optional, Dict
 from tenacity import retry, stop_after_attempt, wait_exponential
 import tempfile
 # Constants
-SUPPORTED_IMAGE_TYPES = [".jpg", ".png", ".jpeg", ".avif"]
-SUPPORTED_DOCUMENT_TYPES = [".pdf"]
 UPLOAD_FOLDER = "./uploads"
 MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
-MAX_PDF_PAGES = 50  # Not used anymore, kept for reference
 # Configuration
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
@@ -34,7 +34,6 @@ class OCRProcessor:
         if not api_key or not isinstance(api_key, str):
             raise ValueError("Valid API key must be provided")
         self.client = Mistral(api_key=api_key)
-        self.file_ids_to_delete = []
         self._validate_client()
     def _validate_client(self) -> None:
@@ -46,52 +45,93 @@ class OCRProcessor:
             raise ValueError(f"API key validation failed: {str(e)}")
     @staticmethod
-    def _check_file_size(file_path: str) -> None:
-        if not os.path.exists(file_path):
-            raise FileNotFoundError(f"File not found: {file_path}")
-        size = os.path.getsize(file_path)
         if size > MAX_FILE_SIZE:
             raise ValueError(f"File size exceeds {MAX_FILE_SIZE/1024/1024}MB limit")
-    def _upload_file_for_ocr(self, file_path: str) -> str:
-        filename = os.path.basename(file_path)
         try:
-            with open(file_path, "rb") as f:
-                uploaded_file = self.client.files.upload(
-                    file={"file_name": filename, "content": f},
-                    purpose="ocr"
-                )
-            self.file_ids_to_delete.append(uploaded_file.id)
-            signed_url = self.client.files.get_signed_url(uploaded_file.id)
-            return signed_url.url
         except Exception as e:
-            logger.error(f"Failed to upload file {filename}: {str(e)}")
-            raise ValueError(f"Failed to upload file: {str(e)}")
     @staticmethod
-    def _convert_first_page(pdf_path: str) -> Optional[str]:
         try:
             pdf_document = fitz.open(pdf_path)
-            if pdf_document.page_count == 0:
                 pdf_document.close()
-                return None
-            page = pdf_document[0]
-            pix = page.get_pixmap(dpi=100)
-            img_path = os.path.join(UPLOAD_FOLDER, f"preview_{int(time.time())}.png")
-            pix.save(img_path)
             pdf_document.close()
-            return img_path
         except Exception as e:
-            logger.error(f"Error converting first page of {pdf_path}: {str(e)}")
-            return None
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
-    def _call_ocr_api(self, document: Dict) -> OCRResponse:
         try:
             logger.info("Calling OCR API")
             response = self.client.ocr.process(
                 model="mistral-ocr-latest",
-                document=document,
                 include_image_base64=True
             )
             return response
@@ -99,61 +139,64 @@ class OCRProcessor:
             logger.error(f"OCR API call failed: {str(e)}")
             raise
-    def process_file(self, file: gr.File) -> Tuple[str, str]:
         """Process uploaded file (image or PDF)."""
         if not file:
-            return "## No file provided", ""
-        file_path = file.name
-        self._check_file_size(file_path)
-        file_name = os.path.basename(file_path)
-        ext = os.path.splitext(file_name)[1].lower()
-        try:
-            if ext in SUPPORTED_IMAGE_TYPES:
-                mime_type, _ = mimetypes.guess_type(file_path)
-                if mime_type is None:
-                    mime_type = "image/png"
-                with open(file_path, "rb") as image_file:
-                    image_data = image_file.read()
-                base64_encoded = base64.b64encode(image_data).decode('utf-8')
-                data_url = f"data:{mime_type};base64,{base64_encoded}"
-                document = {"type": "image_url", "image_url": data_url}
-                response = self._call_ocr_api(document)
-                markdown = self._combine_markdown(response)
-                return markdown, file_path
-            elif ext in SUPPORTED_DOCUMENT_TYPES:
-                signed_url = self._upload_file_for_ocr(file_path)
-                document = {"type": "document_url", "document_url": signed_url}
-                response = self._call_ocr_api(document)
                 markdown = self._combine_markdown(response)
-                return markdown, file_path
-            else:
-                return f"## Unsupported file type. Supported: {', '.join(SUPPORTED_IMAGE_TYPES + SUPPORTED_DOCUMENT_TYPES)}", file_path
-        except Exception as e:
-            logger.error(f"Error processing file {file_name}: {str(e)}")
-            return f"## Error processing file: {str(e)}", file_path
-    def process_url(self, url: str) -> Tuple[str, str]:
         """Process URL (image or PDF)."""
         if not url:
-            return "## No URL provided", ""
-        parsed_url = url.split('/')[-1] if '/' in url else url
-        ext = os.path.splitext(parsed_url)[1].lower()
-        try:
-            if ext in SUPPORTED_IMAGE_TYPES:
-                document = {"type": "image_url", "image_url": url}
-                response = self._call_ocr_api(document)
-                markdown = self._combine_markdown(response)
-                return markdown, url
-            elif ext in SUPPORTED_DOCUMENT_TYPES:
-                document = {"type": "document_url", "document_url": url}
-                response = self._call_ocr_api(document)
                 markdown = self._combine_markdown(response)
-                return markdown, url
-            else:
-                return f"## Unsupported URL type. Supported: {', '.join(SUPPORTED_IMAGE_TYPES + SUPPORTED_DOCUMENT_TYPES)}", url
-        except Exception as e:
-            logger.error(f"Error processing URL {url}: {str(e)}")
-            return f"## Error processing URL: {str(e)}", url
     @staticmethod
     def _combine_markdown(response: OCRResponse) -> str:
@@ -173,53 +216,20 @@ class OCRProcessor:
             markdown_parts.append(markdown)
         return "\n\n".join(markdown_parts) or "## No text detected"
-def update_file_preview(file):
-    if not file:
-        return gr.update(value=[])
-    ext = os.path.splitext(os.path.basename(file.name))[1].lower()
-    if ext in SUPPORTED_IMAGE_TYPES:
-        return gr.update(value=[file.name])
-    elif ext in SUPPORTED_DOCUMENT_TYPES:
-        first_page = OCRProcessor._convert_first_page(file.name)
-        return gr.update(value=[first_page] if first_page else [])
-    else:
-        return gr.update(value=[])
-def update_url_preview(url):
-    if not url:
-        return gr.update(value=[])
-    parsed_url = url.split('/')[-1] if '/' in url else url
-    ext = os.path.splitext(parsed_url)[1].lower()
-    if ext in SUPPORTED_IMAGE_TYPES:
-        return gr.update(value=[url])
-    elif ext == '.pdf':  # Only preview PDFs
-        try:
-            response = requests.get(url, timeout=10, stream=True)
-            response.raise_for_status()
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
-                shutil.copyfileobj(response.raw, temp_pdf)
-                temp_pdf_path = temp_pdf.name
-            first_page = OCRProcessor._convert_first_page(temp_pdf_path)
-            os.unlink(temp_pdf_path)
-            return gr.update(value=[first_page] if first_page else [])
-        except Exception as e:
-            logger.error(f"URL preview error: {str(e)}")
-            return gr.update(value=[])
-    else:
-        return gr.update(value=[])
 def create_interface():
     css = """
     .output-markdown {font-size: 14px; max-height: 500px; overflow-y: auto;}
     .status {color: #666; font-style: italic;}
     .preview {max-height: 300px;}
     """
     with gr.Blocks(title="Mistral OCR Demo", css=css) as demo:
         gr.Markdown("# Mistral OCR Demo")
-        gr.Markdown(f"""Process PDFs and images (max {MAX_FILE_SIZE/1024/1024}MB) via upload or URL.
-        Supported: Images ({', '.join(SUPPORTED_IMAGE_TYPES)}), Documents ({', '.join(SUPPORTED_DOCUMENT_TYPES)}).
-        View previews and OCR results with embedded images.
-        Learn more at [Mistral OCR](https://docs.mistral.ai/capabilities/document_ai/basic_ocr).""")
         # API Key Setup
         with gr.Row():
@@ -234,61 +244,66 @@ def create_interface():
                 return processor, "✅ API key validated"
             except Exception as e:
                 return None, f"❌ Error: {str(e)}"
         set_key_btn.click(fn=init_processor, inputs=api_key_input, outputs=[processor_state, status])
         # File Upload Tab
         with gr.Tab("Upload File"):
             with gr.Row():
-                file_input = gr.File(label="Upload Image/PDF", file_types=SUPPORTED_IMAGE_TYPES + SUPPORTED_DOCUMENT_TYPES)
-            file_preview = gr.Gallery(label="Preview", elem_classes="preview")
             file_output = gr.Markdown(label="OCR Result", elem_classes="output-markdown")
-            file_raw_output = gr.Textbox(label="Source Path")
             file_button = gr.Button("Process", variant="primary")
-            file_input.change(fn=update_file_preview, inputs=file_input, outputs=file_preview)
-            def process_file_fn(p, f):
-                if not p:
-                    return "## Set API key first", ""
-                return p.process_file(f)
             file_button.click(
-                fn=process_file_fn,
                 inputs=[processor_state, file_input],
-                outputs=[file_output, file_raw_output]
             )
         # URL Tab
         with gr.Tab("URL Input"):
             with gr.Row():
-                url_input = gr.Textbox(label="URL to Image/PDF")
-            url_preview = gr.Gallery(label="Preview", elem_classes="preview")
             url_output = gr.Markdown(label="OCR Result", elem_classes="output-markdown")
-            url_raw_output = gr.Textbox(label="Source URL")
             url_button = gr.Button("Process", variant="primary")
-            url_input.change(fn=update_url_preview, inputs=url_input, outputs=url_preview)
-            def process_url_fn(p, u):
-                if not p:
-                    return "## Set API key first", ""
-                return p.process_url(u)
             url_button.click(
-                fn=process_url_fn,
                 inputs=[processor_state, url_input],
-                outputs=[url_output, url_raw_output]
             )
         gr.Examples(
             examples=[],
             inputs=[file_input, url_input]
         )
     return demo
 if __name__ == "__main__":
     os.environ['START_TIME'] = time.strftime('%Y-%m-%d %H:%M:%S')
-    print(f"===== Application Startup at {os.environ['START_TIME']} ===")
-    demo = create_interface()
-    demo.launch(share=True, max_threads=1)

 import time
 import pymupdf as fitz
 import logging
+from mistralai import Mistral, ImageURLChunk
 from mistralai.models import OCRResponse
 from typing import Union, List, Tuple, Optional, Dict
 from tenacity import retry, stop_after_attempt, wait_exponential
+from concurrent.futures import ThreadPoolExecutor
 import tempfile
 # Constants
+SUPPORTED_IMAGE_TYPES = [".jpg", ".png", ".jpeg"]
+SUPPORTED_PDF_TYPES = [".pdf"]
 UPLOAD_FOLDER = "./uploads"
 MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
+MAX_PDF_PAGES = 50
 # Configuration
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
         if not api_key or not isinstance(api_key, str):
             raise ValueError("Valid API key must be provided")
         self.client = Mistral(api_key=api_key)
         self._validate_client()
     def _validate_client(self) -> None:
             raise ValueError(f"API key validation failed: {str(e)}")
     @staticmethod
+    def _check_file_size(file_input: Union[str, bytes]) -> None:
+        if isinstance(file_input, str) and os.path.exists(file_input):
+            size = os.path.getsize(file_input)
+        elif hasattr(file_input, 'read'):
+            size = len(file_input.read())
+            file_input.seek(0)
+        else:
+            size = len(file_input)
         if size > MAX_FILE_SIZE:
             raise ValueError(f"File size exceeds {MAX_FILE_SIZE/1024/1024}MB limit")
+    @staticmethod
+    def _save_uploaded_file(file_input: Union[str, bytes], filename: str) -> str:
+        clean_filename = os.path.basename(filename).replace(os.sep, "_")
+        file_path = os.path.join(UPLOAD_FOLDER, f"{int(time.time())}_{clean_filename}")
         try:
+            if isinstance(file_input, str) and file_input.startswith("http"):
+                response = requests.get(file_input, timeout=30)
+                response.raise_for_status()
+                with open(file_path, 'wb') as f:
+                    f.write(response.content)
+            elif isinstance(file_input, str) and os.path.exists(file_input):
+                shutil.copy2(file_input, file_path)
+            else:
+                with open(file_path, 'wb') as f:
+                    if hasattr(file_input, 'read'):
+                        shutil.copyfileobj(file_input, f)
+                    else:
+                        f.write(file_input)
+            if not os.path.exists(file_path):
+                raise FileNotFoundError(f"Failed to save file at {file_path}")
+            return file_path
         except Exception as e:
+            logger.error(f"Error saving file {filename}: {str(e)}")
+            raise
     @staticmethod
+    def _encode_image(image_path: str) -> str:
+        try:
+            with open(image_path, "rb") as image_file:
+                return base64.b64encode(image_file.read()).decode('utf-8')
+        except Exception as e:
+            logger.error(f"Error encoding image {image_path}: {str(e)}")
+            raise ValueError(f"Failed to encode image: {str(e)}")
+    @staticmethod
+    def _pdf_to_images(pdf_path: str) -> List[Tuple[str, str]]:
         try:
             pdf_document = fitz.open(pdf_path)
+            if pdf_document.page_count > MAX_PDF_PAGES:
                 pdf_document.close()
+                raise ValueError(f"PDF exceeds maximum page limit of {MAX_PDF_PAGES}")
+            with ThreadPoolExecutor() as executor:
+                image_data = list(executor.map(
+                    lambda i: OCRProcessor._convert_page(pdf_path, i),
+                    range(pdf_document.page_count)
+                ))
             pdf_document.close()
+            return [data for data in image_data if data]
         except Exception as e:
+            logger.error(f"Error converting PDF to images: {str(e)}")
+            return []
+    @staticmethod
+    def _convert_page(pdf_path: str, page_num: int) -> Tuple[str, str]:
+        try:
+            pdf_document = fitz.open(pdf_path)
+            page = pdf_document[page_num]
+            pix = page.get_pixmap(dpi=150)
+            image_path = os.path.join(UPLOAD_FOLDER, f"page_{page_num + 1}_{int(time.time())}.png")
+            pix.save(image_path)
+            encoded = OCRProcessor._encode_image(image_path)
+            pdf_document.close()
+            return image_path, encoded
+        except Exception as e:
+            logger.error(f"Error converting page {page_num}: {str(e)}")
+            return None, None
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
+    def _call_ocr_api(self, encoded_image: str) -> OCRResponse:
+        base64_url = f"data:image/png;base64,{encoded_image}"
         try:
             logger.info("Calling OCR API")
             response = self.client.ocr.process(
                 model="mistral-ocr-latest",
+                document=ImageURLChunk(image_url=base64_url),
                 include_image_base64=True
             )
             return response
             logger.error(f"OCR API call failed: {str(e)}")
             raise
+    def process_file(self, file: gr.File) -> Tuple[str, str, List[str]]:
         """Process uploaded file (image or PDF)."""
         if not file:
+            return "## No file provided", "", []
+        file_name = file.name
+        self._check_file_size(file)
+        file_path = self._save_uploaded_file(file, file_name)
+        if file_name.lower().endswith(tuple(SUPPORTED_IMAGE_TYPES)):
+            encoded_image = self._encode_image(file_path)
+            response = self._call_ocr_api(encoded_image)
+            markdown = self._combine_markdown(response)
+            return markdown, file_path, [file_path]
+        elif file_name.lower().endswith('.pdf'):
+            image_data = self._pdf_to_images(file_path)
+            if not image_data:
+                return "## No pages converted from PDF", file_path, []
+            ocr_results = []
+            image_paths = [path for path, _ in image_data]
+            for _, encoded in image_data:
+                response = self._call_ocr_api(encoded)
                 markdown = self._combine_markdown(response)
+                ocr_results.append(markdown)
+            return "\n\n".join(ocr_results), file_path, image_paths
+        return "## Unsupported file type", file_path, []
+    def process_url(self, url: str) -> Tuple[str, str, List[str]]:
         """Process URL (image or PDF)."""
         if not url:
+            return "## No URL provided", "", []
+        file_name = url.split('/')[-1] or f"file_{int(time.time())}"
+        file_path = self._save_uploaded_file(url, file_name)
+        if file_name.lower().endswith(tuple(SUPPORTED_IMAGE_TYPES)):
+            encoded_image = self._encode_image(file_path)
+            response = self._call_ocr_api(encoded_image)
+            markdown = self._combine_markdown(response)
+            return markdown, url, [file_path]
+        elif file_name.lower().endswith('.pdf'):
+            image_data = self._pdf_to_images(file_path)
+            if not image_data:
+                return "## No pages converted from PDF", url, []
+            ocr_results = []
+            image_paths = [path for path, _ in image_data]
+            for _, encoded in image_data:
+                response = self._call_ocr_api(encoded)
                 markdown = self._combine_markdown(response)
+                ocr_results.append(markdown)
+            return "\n\n".join(ocr_results), url, image_paths
+        return "## Unsupported URL content type", url, []
     @staticmethod
     def _combine_markdown(response: OCRResponse) -> str:
             markdown_parts.append(markdown)
         return "\n\n".join(markdown_parts) or "## No text detected"
 def create_interface():
     css = """
     .output-markdown {font-size: 14px; max-height: 500px; overflow-y: auto;}
     .status {color: #666; font-style: italic;}
     .preview {max-height: 300px;}
     """
     with gr.Blocks(title="Mistral OCR Demo", css=css) as demo:
         gr.Markdown("# Mistral OCR Demo")
+        gr.Markdown(f"""
+            Process PDFs and images (max {MAX_FILE_SIZE/1024/1024}MB, {MAX_PDF_PAGES} pages for PDFs) via upload or URL.
+            View previews and OCR results with embedded images.
+            Learn more at [Mistral OCR](https://mistral.ai/news/mistral-ocr).
+        """)
         # API Key Setup
         with gr.Row():
                 return processor, "✅ API key validated"
             except Exception as e:
                 return None, f"❌ Error: {str(e)}"
         set_key_btn.click(fn=init_processor, inputs=api_key_input, outputs=[processor_state, status])
         # File Upload Tab
         with gr.Tab("Upload File"):
             with gr.Row():
+                file_input = gr.File(label="Upload PDF/Image", file_types=SUPPORTED_IMAGE_TYPES + SUPPORTED_PDF_TYPES)
+                file_preview = gr.Gallery(label="Preview", elem_classes="preview")
             file_output = gr.Markdown(label="OCR Result", elem_classes="output-markdown")
+            file_raw_output = gr.Textbox(label="Raw File Path")
             file_button = gr.Button("Process", variant="primary")
+            def update_file_preview(file):
+                return [file.name] if file else []
+            file_input.change(fn=update_file_preview, inputs=file_input, outputs=file_preview)
             file_button.click(
+                fn=lambda p, f: p.process_file(f) if p else ("## Set API key first", "", []),
                 inputs=[processor_state, file_input],
+                outputs=[file_output, file_raw_output, file_preview]
             )
         # URL Tab
         with gr.Tab("URL Input"):
             with gr.Row():
+                url_input = gr.Textbox(label="URL to PDF/Image")
+                url_preview = gr.Gallery(label="Preview", elem_classes="preview")
             url_output = gr.Markdown(label="OCR Result", elem_classes="output-markdown")
+            url_raw_output = gr.Textbox(label="Raw URL")
             url_button = gr.Button("Process", variant="primary")
+            def update_url_preview(url):
+                if not url:
+                    return []
+                try:
+                    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.tmp')
+                    response = requests.get(url, timeout=10)
+                    temp_file.write(response.content)
+                    temp_file.close()
+                    return [temp_file.name]
+                except Exception as e:
+                    logger.error(f"URL preview error: {str(e)}")
+                    return []
+            url_input.change(fn=update_url_preview, inputs=url_input, outputs=url_preview)
             url_button.click(
+                fn=lambda p, u: p.process_url(u) if p else ("## Set API key first", "", []),
                 inputs=[processor_state, url_input],
+                outputs=[url_output, url_raw_output, url_preview]
             )
+        # Examples
         gr.Examples(
             examples=[],
             inputs=[file_input, url_input]
         )
     return demo
 if __name__ == "__main__":
     os.environ['START_TIME'] = time.strftime('%Y-%m-%d %H:%M:%S')
+    print(f"===== Application Startup at {os.environ['START_TIME']} =====")
+    create_interface().launch(share=True, max_threads=1)