Error when running with transformers.

#2
by ammar90 - opened

OSError: It looks like the config file at '/root/.cache/huggingface/hub/models--microsoft--OmniParser-v2.0/snapshots/09fae83e39987d4c19e676253380033a0603dbc3/config.json' is not a valid JSON file.

same issue

Hi everyone, I got it running on my MacBook using this script:

from ultralytics import YOLO
from huggingface_hub import hf_hub_download

# Fetch the icon-detection checkpoint from the Hugging Face Hub
model_path = hf_hub_download(
    repo_type="model",
    repo_id="microsoft/OmniParser-v2.0",
    filename="icon_detect/model.pt",
)

# Wrap the downloaded checkpoint in a YOLO model
model = YOLO(model_path)

# Detect icons in a single image
results = model("test_image.jpg")

# Report every detected box, one line per detection, in result order
all_boxes = (found for result in results for found in result.boxes)
for box in all_boxes:
    coords = box.xyxy[0].tolist()
    confidence = box.conf.item()
    print(f"Detected icon with confidence {confidence:.3f} at location {coords}")

ensure you have requirements.txt with the following:

torch>=2.0.0
ultralytics
huggingface-hub

or simply run: pip install "torch>=2.0.0" ultralytics huggingface-hub (the quotes stop the shell from treating >= as an output redirect)

Additionally, make sure you are using a virtual environment:
On Windows:

python -m venv venv
venv\Scripts\activate

On macOS/Linux:

python3 -m venv venv
source venv/bin/activate

Once done, you can run the script with this command: python simple_detect.py
Don't forget to replace test_image.jpg in the script with the image name you want to detect elements on!

Note that this will only print the detections as text; you can modify the code further to draw the bounding boxes according to your needs.

OSError: It looks like the config file at '/root/.cache/huggingface/hub/models--microsoft--OmniParser-v2.0/snapshots/09fae83e39987d4c19e676253380033a0603dbc3/config.json' is not a valid JSON file.

+1

plus one!!!!!! I want to use this model to hack the world😎😎😎😎 Let's go baby!!!!!!!

+1 appreciate a fix! :)

+1 can't use the model

config.json file is empty

guys I have updated the config.json file I think you can use it now

guys I have updated the config.json file I think you can use it now
No, it doesn't work.

Yes, it's not working. I checked as soon as I updated it: the changes have taken place only in my account, not in the main account.

It's been 20 days and the error still persists... Any alternatives for using it on a CPU?

It's been 20 days and the error still persists... Any alternatives for using it on a CPU?

Yeah, it's crazy. Unusable.

from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from PIL import Image
import pytesseract
import google.generativeai as genai
import base64
import torch
import requests
import layoutparser as lp
import numpy as np
import re

def combined_ocr_layoutlmv3_tesseract_gemini_layoutparser(image_path, gemini_api_key, tesseract_path=None):
    """
    Extract the text of an image by combining LayoutLMv3, Tesseract, LayoutParser and Gemini.

    The image is run through LayoutLMv3, through Tesseract on the full page, and
    through Tesseract on every "Text" region found by LayoutParser. The three
    intermediate texts and the image itself are then sent to Gemini, which is
    asked for the complete, corrected text.

    Args:
        image_path: Local file path, or an ``http://`` / ``https://`` URL, of the image.
        gemini_api_key: API key passed to ``genai.configure``.
        tesseract_path: Optional path to the Tesseract executable; when given it
            overrides ``pytesseract.pytesseract.tesseract_cmd``.

    Returns:
        The text returned by Gemini, or ``None`` if any step raised an exception
        (the error is printed, not re-raised).
    """
    import io  # local import: only needed to hand the raw image bytes to PIL

    try:
        # Read the raw bytes exactly once so the same data feeds both PIL and
        # Gemini. Doing this first also fails fast on a bad path or URL before
        # any model is loaded.
        if image_path.startswith(("http://", "https://")):
            http_response = requests.get(image_path, timeout=30)
            http_response.raise_for_status()
            image_data = http_response.content
        else:
            with open(image_path, "rb") as image_file:
                image_data = image_file.read()

        source_image = Image.open(io.BytesIO(image_data))
        # The format is only known on the freshly opened image; convert()
        # returns a new image, so resolve the MIME type before converting.
        mime_type = Image.MIME.get(source_image.format, "image/jpeg")
        image = source_image.convert("RGB")

        # LayoutLMv3 OCR (Token Classification)
        processor_layoutlm = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-large")
        model_layoutlm = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-large")

        question = "Extract all the text from this document."
        inputs = processor_layoutlm(images=image, text=question, return_tensors="pt")
        with torch.no_grad():
            outputs = model_layoutlm(**inputs)
        predicted_token_class_ids = torch.argmax(outputs.logits, dim=-1)
        # NOTE(review): these are argmax indices over the classification logits,
        # yet they are decoded with the tokenizer as if they were vocabulary
        # ids - confirm this is the intended way to obtain text from this model.
        predicted_tokens = processor_layoutlm.tokenizer.batch_decode(predicted_token_class_ids, skip_special_tokens=True)
        # batch_decode yields one string per sequence; use it directly. Joining
        # it with " " would insert a space between every single character.
        layoutlm_text = predicted_tokens[0]

        # Correct gap issue in LayoutLMv3 text: re-attach "<digits>." to a following "<digits>D"
        layoutlm_text = re.sub(r"(\d+\.)\s+(\d+D)", r"\1\2", layoutlm_text)

        # Tesseract OCR on the full image
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
        tesseract_text = pytesseract.image_to_string(image)

        # LayoutParser: detect layout regions, then OCR only the "Text" ones
        model_layoutparser = lp.models.PaddleDetectionLayoutModel(
            config_path="lp://PubLayNet/ppyolov2_r50vd_dcn_365e/config",
            label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
        )

        image_np = np.array(image)
        layout_result = model_layoutparser.detect(image_np)
        # Each region's text is followed by a single space, as before.
        layoutparser_text = "".join(
            pytesseract.image_to_string(
                image.crop((block.block.x_1, block.block.y_1, block.block.x_2, block.block.y_2))
            ).strip() + " "
            for block in layout_result
            if block.type == "Text"
        )

        # Gemini API
        genai.configure(api_key=gemini_api_key)
        model_gemini = genai.GenerativeModel('gemini-2.0-flash')

        image_base64 = base64.b64encode(image_data).decode("utf-8")
        img = {
            "mime_type": mime_type,
            "data": image_base64
        }

        response = model_gemini.generate_content([
            f"Here is text extracted by LayoutLMv3: '{layoutlm_text}'. Here is text extracted by tesseract: '{tesseract_text}'. Here is text extracted by LayoutParser: '{layoutparser_text}'. What is the complete, and corrected text in this image?",
            img
        ])
        return response.text

    except Exception as e:
        # Top-level boundary of the pipeline: report the failure and signal it
        # to the caller with None rather than propagating.
        print(f"Error during combined OCR: {e}")
        return None

if __name__ == "__main__":
    import os  # local import: only the script entry point reads the environment

    # Read the key from the environment instead of hard-coding it; a real API
    # key must never be committed or pasted into shared source.
    gemini_api_key = os.environ.get("GEMINI_API_KEY")
    image_file_path = "/content/image_to_text.jpg"  # Replace with your image file path
    tesseract_executable_path = "/usr/bin/tesseract"

    if not gemini_api_key:
        raise SystemExit("Set the GEMINI_API_KEY environment variable to your Gemini API key.")

    extracted_text = combined_ocr_layoutlmv3_tesseract_gemini_layoutparser(
        image_file_path, gemini_api_key, tesseract_executable_path
    )

    if extracted_text:
        # Save the output to a file
        with open("extracted_text.txt", "w", encoding="utf-8") as f:
            f.write(extracted_text)
        print("Extracted text saved to extracted_text.txt")
    else:
        print("Combined OCR failed.")
    # Original poster's note: use this code for OCR purposes, it works great. Try this once.

Sign up or log in to comment