Error when running with transformers.
OSError: It looks like the config file at '/root/.cache/huggingface/hub/models--microsoft--OmniParser-v2.0/snapshots/09fae83e39987d4c19e676253380033a0603dbc3/config.json' is not a valid JSON file.
Same
same issue
+1
+1
same here
Hi everyone, I got it running on my MacBook using this script:
from ultralytics import YOLO
from huggingface_hub import hf_hub_download

# Download the icon-detection weights
model_path = hf_hub_download(
    repo_id="microsoft/OmniParser-v2.0",
    filename="icon_detect/model.pt",
    repo_type="model"
)

# Load the model
model = YOLO(model_path)

# Run inference on an image
results = model("test_image.jpg")

# Print detections
for r in results:
    for box in r.boxes:
        confidence = box.conf.item()
        coords = box.xyxy[0].tolist()
        print(f"Detected icon with confidence {confidence:.3f} at location {coords}")
Make sure you have a requirements.txt with the following:
torch>=2.0.0
ultralytics
huggingface-hub
or simply run: pip install "torch>=2.0.0" ultralytics huggingface-hub (the quotes keep the shell from treating >= as a redirect)
Additionally, make sure you are using a virtual environment:
On Windows:
python -m venv venv
venv\Scripts\activate
On macOS/Linux:
python3 -m venv venv
source venv/bin/activate
Once done, you can run the script with this command: python simple_detect.py
Don't forget to replace test_image.jpg in the script with the name of the image you want to detect elements on!
Note that this only prints the detections as text; you can modify the code to draw the bounding boxes on the image, as in the sketch below.
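If you want the boxes drawn on the image, here is a minimal sketch building on the script above (it assumes the same results list from that script; Ultralytics' plot() returns the annotated image as a BGR NumPy array):

from PIL import Image

# plot() draws the boxes and labels and returns the annotated image (BGR)
annotated = results[0].plot()
# Flip BGR -> RGB before saving with PIL
Image.fromarray(annotated[..., ::-1]).save("annotated.jpg")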
+1
+1
plus one! I want to use this model to hack the world, let's go!
+1
+1 appreciate a fix! :)
+1 can't use the model
The config.json file is empty.
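You can confirm that with a quick check; a minimal sketch that downloads the raw config.json and tries to parse it:

import json
from huggingface_hub import hf_hub_download

# Fetch the top-level config.json from the repo
cfg_path = hf_hub_download(repo_id="microsoft/OmniParser-v2.0", filename="config.json")
with open(cfg_path) as f:
    json.load(f)  # raises json.JSONDecodeError if the file is empty or invalid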
Guys, I have updated the config.json file. I think you can use it now.
No, it doesn't work.
Yes, it's not working. I checked: as soon as I made the update, the changes only took effect in my own account, not in the main repo.
It's been 20 days and the error still persists... Any alternatives for using it on a CPU?
Yeah, it's crazy. Unusable.
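For what it's worth, the YOLO workaround posted above doesn't need a GPU. A minimal sketch forcing CPU inference (assuming the same icon_detect/model.pt weights):

from ultralytics import YOLO
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(repo_id="microsoft/OmniParser-v2.0", filename="icon_detect/model.pt")
model = YOLO(model_path)
# device="cpu" forces inference on the CPU even if a GPU is present
results = model("test_image.jpg", device="cpu")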
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from PIL import Image
import pytesseract
import google.generativeai as genai
import base64
import torch
import requests
import layoutparser as lp
import numpy as np
import re

def combined_ocr_layoutlmv3_tesseract_gemini_layoutparser(image_path, gemini_api_key, tesseract_path=None):
    """
    Performs OCR using a combination of LayoutLMv3, Tesseract, Gemini, and LayoutParser,
    and corrects the gap issue in LayoutLMv3 output.
    """
    try:
        # LayoutLMv3 OCR (token classification)
        processor_layoutlm = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-large")
        model_layoutlm = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-large")
        if image_path.startswith("http://") or image_path.startswith("https://"):
            image = Image.open(requests.get(image_path, stream=True).raw).convert("RGB")
        else:
            image = Image.open(image_path).convert("RGB")
        question = "Extract all the text from this document."
        inputs = processor_layoutlm(images=image, text=question, return_tensors="pt")
        with torch.no_grad():
            outputs = model_layoutlm(**inputs)
        predicted_token_class_ids = torch.argmax(outputs.logits, dim=-1)
        # batch_decode returns one string per batch item; take the first (joining
        # its characters with " " would insert spurious spaces)
        predicted_tokens = processor_layoutlm.tokenizer.batch_decode(predicted_token_class_ids, skip_special_tokens=True)
        layoutlm_text = predicted_tokens[0]
        # Correct the gap issue in LayoutLMv3 text
        layoutlm_text = re.sub(r"(\d+\.)\s+(\d+D)", r"\1\2", layoutlm_text)

        # Tesseract OCR
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
        tesseract_text = pytesseract.image_to_string(image)

        # LayoutParser: detect text blocks, then OCR each block with Tesseract
        model_layoutparser = lp.models.PaddleDetectionLayoutModel(
            config_path="lp://PubLayNet/ppyolov2_r50vd_dcn_365e/config",
            label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"})
        image_np = np.array(image)
        layout_result = model_layoutparser.detect(image_np)
        layoutparser_text = ""
        for block in layout_result:
            if block.type == "Text":
                cropped_image = image.crop((block.block.x_1, block.block.y_1, block.block.x_2, block.block.y_2))
                layoutparser_text += pytesseract.image_to_string(cropped_image).strip() + " "

        # Gemini API: ask the model to reconcile the three OCR outputs against the image
        # (note: this re-reads the file from disk, so image_path must be a local path here)
        genai.configure(api_key=gemini_api_key)
        model_gemini = genai.GenerativeModel('gemini-2.0-flash')
        with open(image_path, "rb") as image_file:
            image_data = image_file.read()
        image_base64 = base64.b64encode(image_data).decode("utf-8")
        img = {
            "mime_type": "image/jpeg",
            "data": image_base64
        }
        response = model_gemini.generate_content([
            f"Here is text extracted by LayoutLMv3: '{layoutlm_text}'. Here is text extracted by tesseract: '{tesseract_text}'. Here is text extracted by LayoutParser: '{layoutparser_text}'. What is the complete, and corrected text in this image?",
            img
        ])
        gemini_text = response.text
        return gemini_text
    except Exception as e:
        print(f"Error during combined OCR: {e}")
        return None
if __name__ == "__main__":
    gemini_api_key = "YOUR_GEMINI_API_KEY"  # Replace with your Gemini API key
    image_file_path = "/content/image_to_text.jpg"  # Replace with your image file path
    tesseract_executable_path = "/usr/bin/tesseract"
    extracted_text = combined_ocr_layoutlmv3_tesseract_gemini_layoutparser(image_file_path, gemini_api_key, tesseract_executable_path)
    if extracted_text:
        # Save the output to a file
        with open("extracted_text.txt", "w") as f:
            f.write(extracted_text)
        print("Extracted text saved to extracted_text.txt")
    else:
        print("Combined OCR failed.")

Use this code for OCR purposes, it works great. Try it once!