Thanks Andreas for adding this new OCR / Text Recognition on Selected Area On Screen Action!
@Andreas_Hegenberg Would you consider adding an option or a new Action to perform OCR using OpenAI's vision API? Users could provide their own API key to use it.
In case it's helpful to anyone, a while back I created a simple CLI in Python to easily extract text from a screenshot. What makes it really helpful to me is that I can pass a `--method` argument to specify whether I want to use `tesseract` or `openai` for OCR via the vision API.
The benefit of the `openai` option is that it's excellent at extracting complicated text, preserving line breaks and indentation. For example, it works great when copying code from a tutorial in a YouTube video.
The benefit of the `tesseract` method is that it's free and runs entirely locally. For example, it works great for copying URLs from YouTube videos.
The drawback of the `openai` option is that it costs a fraction of a cent per use and you have to trust OpenAI with your data. Their privacy policy states that they will delete the image, but I assume everything I send them will be used for model training.
I can trigger this CLI using keyboard shortcuts using BTT.
To use the `tesseract` method:
python ~/projects/screenshot/ocr_tesseract_or_openai.py --method tesseract
To use the `openai` method:
python ~/projects/screenshot/ocr_tesseract_or_openai.py --method openai
Here's the `ocr_tesseract_or_openai.py` script:
import argparse
import base64
import logging
import os
import shutil
import subprocess
from datetime import datetime
from pathlib import Path

import instructor
import pyperclip
import pytesseract
from openai import OpenAI, OpenAIError
from PIL import Image
from pydantic import BaseModel, Field
from rich import pretty
from rich.console import Console
from rich.logging import RichHandler
from rich.traceback import install
# Install rich traceback handler and pretty-printing for the console.
install(show_locals=True)
pretty.install()

# Initialize console
console = Console()

# Ensure the screenshots directory exists; the log file lives alongside them.
HOME_PATH = Path.home()
SCREENSHOT_DIR = HOME_PATH / "projects/screenshots"
SCREENSHOT_DIR.mkdir(parents=True, exist_ok=True)
LOG_FILE_PATH = SCREENSHOT_DIR / "ocr_script.log"

# Create a custom logger for this script.
logger = logging.getLogger("ocr_script")
# Fix: the logger itself must be the most permissive filter. It was set to
# ERROR, which dropped every INFO/DEBUG record before the handlers below
# could see them — so none of the script's logger.info(...) calls were
# ever emitted. DEBUG here lets the per-handler levels do the filtering.
logger.setLevel(logging.DEBUG)

# Console shows INFO and above; the log file captures everything.
console_handler = RichHandler(rich_tracebacks=True)
console_handler.setLevel(logging.INFO)  # avoid debug noise on the console
file_handler = logging.FileHandler(LOG_FILE_PATH)
file_handler.setLevel(logging.DEBUG)  # capture all logs in the file

# Formatters: terse on the console (RichHandler renders time/level itself),
# full timestamp/name/level detail in the file.
console_formatter = logging.Formatter("%(message)s", datefmt="[%X]")
file_formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="[%Y-%m-%d %H:%M:%S]",
)
console_handler.setFormatter(console_formatter)
file_handler.setFormatter(file_formatter)

# Attach both handlers to the logger.
logger.addHandler(console_handler)
logger.addHandler(file_handler)
# Function to encode the image
def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 string."""
    logger.debug("Function encode_image(image_path=%s) called.", image_path)
    raw_bytes = Path(image_path).read_bytes()
    encoded = base64.b64encode(raw_bytes).decode("utf-8")
    logger.debug("Image encoded successfully.")
    return encoded
def take_screenshot():
    """Capture an interactive screenshot with macOS' `screencapture`.

    Returns the Path of the saved PNG, or None when the tool is missing,
    the user cancels the capture, or capture fails.
    """
    logger.debug("Function take_screenshot() called.")
    if shutil.which("screencapture") is None:
        logger.error("The 'screencapture' command is not available on this system.")
        return None
    timestamp = datetime.now().strftime("%Y%m%d_%H_%M_%S")
    screenshot_path = SCREENSHOT_DIR / f"{timestamp}_screenshot.png"
    try:
        # Fix: pass an argument list instead of os.system's shell string —
        # the old f-string interpolated the path unquoted, which breaks on
        # spaces or shell metacharacters in the path.
        subprocess.run(["screencapture", "-i", str(screenshot_path)], check=False)
        # An interactive capture can be canceled (Esc); then no file exists.
        if not screenshot_path.exists():
            logger.warning("Screenshot was canceled.")
            return None
        logger.info("Screenshot saved to %s", screenshot_path)
        return screenshot_path
    except Exception:
        logger.exception("Failed to take screenshot:")
        return None
# Pydantic model to represent the text extracted from an image
class ExtractedText(BaseModel):
    """Structured OCR result; instructor validates the LLM reply into this model."""

    text: str = Field(..., description="The text extracted from the image")
def extract_text_with_openai(image_path):
    """Send the screenshot to OpenAI's vision model and return the extracted text.

    Returns None when the API call fails.
    """
    logger.debug("Function extract_text_with_openai(image_path=%s) called.", image_path)
    base64_image = encode_image(image_path)
    # Patch the OpenAI client so responses are parsed into the pydantic model.
    client = instructor.from_openai(OpenAI())
    # Build the multimodal user message: an instruction plus the inline image.
    instruction_part = {
        "type": "text",
        "text": "Extract the text from this image. If the text contains code, preserve the formatting.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
    }
    user_message = {"role": "user", "content": [instruction_part, image_part]}
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[user_message],
            max_tokens=2000,
            response_model=ExtractedText,
        )
    except OpenAIError:
        logger.exception("Failed to extract text using OpenAI:")
        return None
    logger.info("Text extracted using OpenAI.")
    logger.debug("Extracted text: %s", response.text)
    return response.text
def extract_text_with_pytesseract(image_path):
    """Run local Tesseract OCR on the screenshot and return the text.

    Returns None when OCR fails (e.g. Tesseract is not installed).
    """
    logger.debug(
        "Function extract_text_with_pytesseract(image_path=%s) called.", image_path
    )
    try:
        image = Image.open(image_path)
        # Fix: only fall back to the hard-coded Homebrew path when no
        # tesseract binary is on PATH — the old code unconditionally
        # overrode a PATH-discovered install, breaking non-Homebrew setups.
        if shutil.which("tesseract") is None:
            pytesseract.pytesseract.tesseract_cmd = r"/opt/homebrew/bin/tesseract"
        # --oem 3: default OCR engine; --psm 6: assume a single uniform
        # block of text (per the Tesseract CLI docs).
        text = pytesseract.image_to_string(image, config="--oem 3 --psm 6")
        logger.info("Text extracted using pytesseract.")
        logger.debug("Extracted text: %s", text)
        return text
    except Exception:
        logger.exception("Failed to extract text using pytesseract:")
        return None  # Return None instead of the exception object
if __name__ == "__main__":
    # Parse the single CLI option selecting the OCR backend.
    parser = argparse.ArgumentParser(description="OCR Script")
    parser.add_argument(
        "--method",
        choices=["openai", "tesseract"],
        default="tesseract",
        help="Specify the OCR method to use: 'openai' or 'tesseract'. Default is 'tesseract'.",
    )
    args = parser.parse_args()

    # Log a banner marking the start of this invocation, with its arguments.
    banner = "*" * 80
    logger.info(banner)
    logger.info("Script called with arguments: %s", args)
    logger.info(banner)

    image_path = take_screenshot()
    if not image_path:
        logger.warning("No screenshot taken. Exiting.")
    else:
        # Dispatch to the selected OCR backend.
        extractors = {
            "openai": extract_text_with_openai,
            "tesseract": extract_text_with_pytesseract,
        }
        extracted_text = extractors[args.method](image_path)
        if extracted_text is None:
            logger.warning("No text extracted. Exiting.")
        else:
            print(extracted_text, end="\n\n")
            pyperclip.copy(extracted_text)
            logger.info("Text has been copied to the clipboard.")