There are three OCR software packages that I decided to test.
Here are some test cases that have been polished by running the output text through a filter that removes lines containing only 1-3 characters. The full code is at the bottom of the page. I am aware I could also filter the text through a word dictionary, but there are none that account for the huge expanse of the English language with all its slang and word-play.
My conclusion is that ocrs is the best in terms of accuracy and speed, often finishing 10x faster than doctr.
Tesseract kinda sucks.
Time: 0.360 seconds
>>77914072 >>77922406
Fuck it's so over. Relationships are out of the question now totally
Time: 1.235 seconds
Anonymous 06/20/240hu132643 No.77897896 >77914072 >77922406
Fuck it's SO over. Relationships are out of the question now totally
Time: 0.258 seconds
[ Anonymous 06/20/24(Thu)13:26:43 No.77897896 >>77914072 >>77922406
Fuck it's so over. Relationships are out of the question now totally
Time: 0.168 seconds
Time: 0.991 seconds
Time: 0.184 seconds
Time: 0.094 seconds
THE BILINDA BUTCHERS,
Time: 0.987 seconds
THE BILINDA BUTCHERS
Time: 0.143 seconds
THE BILINDA BUTCHERS
Time: 0.098 seconds
are deat to the]
as of this world
Time: 1.084 seconds
waouuoos
oottos
you are deaf to the
sins of this world
FIRECE You are the woret
person - met on
toontown
Time: 0.181 seconds
VIVON
n4pu
the woret
parcon Lmet
toontown
you are deaf to the
sins of this world
UUEN
FIRECE
Time: 0.139 seconds
rN | ya|
* Max boost is the maximum chievable by a sit
pos te ria pa et iba aos
vat a ued ne ad
io Pertierra ereeiond a
Time: 1.332 seconds
- Max boost is the maximum frequency achievable by a single core on
the processor running a bursty single-threaded workload. Max boost will
vary based on several factors, induding, but not limited to: thermal paste.
system copling, motherboard design and BIOS, the latest AMD chipset
driver, and the latest 05 updates. Visita amd -
com for more details.
Time: 0.217 seconds
Max boost
the processor running bursty sinle-threaded workload. Max boost will
vary based on several factors. including, but not limited to thermal paste
system ronling: motherboard desin and BIOS the latest AMD chipset
driver: and the latest 05 updates Visit amd.com for more details
the maximum frequency achievable by
sinele core on
Time: 0.835 seconds
Selection
Time: 1.331 seconds
SOAEnTS
M4LBle
Imisiee
MIEEIIS
MAISEREEI MEMSAGSUNE
TK Sum4 Rau
Smage
TRISHNEANS
NNISFREE
sale
FguidSb.fion TheNaturdl
Selection
Maguires
KITS
Barbers
e-juice
Traditional Gents Barber Shop
Time: 0.430 seconds
SOuVenRs
c dvalasle 1Sriumnk
Aa?exsonstown
IW NEWS
USTTPO
NNISFREE NGIWSAGENTS
THR IRISHINEINS OE S PRe
AADCEON
CN TH"mage
URCLINI
-INNISFREE-
sale
liquid C
e Ia
-10%
KITS
e-iuice
Maguires
?Barbers
7adional Gonts Barber Sha
import functools
import re
import string
import subprocess
from time import perf_counter
from typing import List

import pytesseract
from doctr.io import DocumentFile
from doctr.io.elements import Page
from doctr.models import ocr_predictor
from doctr.models.predictor.pytorch import OCRPredictor
from numpy import ndarray
def apply_text_filter(func):
    """Decorate a method so that its return value is post-filtered.

    The wrapped method is called unchanged and whatever it returns is passed
    through ``self.text_filter`` before being handed back to the caller.

    Args:
        func: A method taking ``self`` plus arbitrary positional/keyword
            arguments and returning the text to filter.

    Returns:
        A wrapper with the same call signature that returns
        ``self.text_filter(func(self, *args, **kwargs))``.
    """

    # functools.wraps keeps the decorated method's __name__ and __doc__.
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        text = func(self, *args, **kwargs)
        return self.text_filter(text)

    return wrapper
class OCRBase:
    """Base class providing the shared text post-filter.

    ``process`` is a no-op placeholder here; ``text_filter`` holds the
    clean-up logic applied to recognised text.
    """

    def __init__(self):
        pass

    def process(self):
        """Placeholder that does nothing and returns ``None``."""
        pass

    def text_filter(self, text):
        """Drop very short lines from ``text`` and trim the result.

        A line is removed when, apart from surrounding whitespace, it consists
        of a single run of 1-3 non-whitespace characters. The newline that
        follows a removed line is removed with it.

        Args:
            text: The string to clean up.

        Returns:
            The filtered string with leading and trailing whitespace stripped.
        """
        # (?m) makes ^ and $ match at every line boundary, not just the ends
        # of the whole string.
        return re.sub(r'(?m)^\s*\S{1,3}\s*$\n?', '', text).strip()
class OCRDoctr(OCRBase):
    """
    OCR engine backed by a docTR ``ocr_predictor`` model.

    python3.12 -m pip install doctr

    NOTE(review): the docTR library is presumably published on PyPI as
    ``python-doctr`` rather than ``doctr`` -- confirm before relying on the
    install line above.
    """

    def __init__(self):
        """Load the pretrained detection + recognition model (prints progress)."""
        print('Loading OCR Model...')
        self.model: OCRPredictor = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
        print('Finished')

    @apply_text_filter
    def process(self, image_path):
        """Run the model on ``image_path`` and return the rendered text.

        The decorator passes the returned text through ``self.text_filter``.

        Args:
            image_path: Image location handed to ``DocumentFile.from_images``.

        Returns:
            The string produced by ``render()`` on the model output.
        """
        doc: List[ndarray] = DocumentFile.from_images(image_path)
        # NOTE(review): annotated as Page as in the original; the predictor
        # presumably returns a Document (which also has render()) -- confirm.
        result: Page = self.model(doc)
        return result.render()
class OCRTerreract(OCRBase):
    """
    OCR engine backed by Tesseract through ``pytesseract``.

    sudo apt update
    sudo apt install tesseract-ocr
    sudo apt install libtesseract-dev
    python3.12 -m pip install pytesseract
    """

    def __init__(self):
        """Build the Tesseract config string with a character whitelist."""
        # Backslash-escape both quote characters so they can sit inside the
        # double-quoted whitelist value below.
        whitelist = string.printable.replace('"', '\\"').replace("'", "\\'")
        self.config = fr'-c tessedit_char_whitelist=" {whitelist}"'

    @apply_text_filter
    def process(self, image_path):
        """Recognise the text in ``image_path`` with Tesseract.

        The decorator passes the returned text through ``self.text_filter``.

        Args:
            image_path: Image passed to ``pytesseract.image_to_string``.

        Returns:
            The text returned by ``pytesseract.image_to_string``, called with
            ``self.config`` and a timeout of 20.
        """
        text = pytesseract.image_to_string(image_path, config=self.config, timeout=20)
        return text
class OCRRobertKnight(OCRBase):
    """
    OCR engine that shells out to the ``ocrs`` command-line tool.

    curl https://sh.rustup.rs -sSf | sh
    sudo apt install cargo
    cargo install ocrs-cli
    """

    def __init__(self):
        pass

    @apply_text_filter
    def process(self, image_path):
        """Run ``ocrs`` on ``image_path`` and return what it wrote to stdout.

        The decorator passes the returned text through ``self.text_filter``.
        The command's exit status is not checked; only its captured stdout
        is used.

        Args:
            image_path: Path passed as the sole argument to ``ocrs``.

        Returns:
            The captured standard output, decoded as text.
        """
        # Argument list (no shell), so the path is passed through verbatim.
        return subprocess.run(['ocrs', image_path], capture_output=True, text=True).stdout
if __name__ == "__main__":
    # Benchmark images mapped to a short alias. The aliases are not used by
    # the loop below; they only label the entries for the reader.
    file_paths = {
        '/home/USER/Documents/images/images/1719103056310.png': 'its over',
        '/home/USER/Documents/images/images/halt.png': 'grinch',
        '/home/USER/Documents/images/images/th-1384675618.jpeg': 'heaven',
        '/home/USER/Documents/images/images/toontown.jpg': 'towntown',
        '/home/USER/Documents/images/images/max_boost.png': 'max_boost',
        '/home/USER/Documents/images/images/ireland.jpg': 'ireland',
    }

    # Engines are constructed once, before any timing starts, so setup done
    # in their constructors is excluded from the per-image measurements.
    d = {
        'tesseract': OCRTerreract(),
        'doctr': OCRDoctr(),
        'ocrs': OCRRobertKnight(),
    }

    for file_path in file_paths:
        # Separator between images in the generated report.
        print()
        print('---')
        print()

        for name, obj in d.items():
            # Time only the process() call (which includes the text filter).
            start = perf_counter()
            text = obj.process(file_path)
            elapsed_time = perf_counter() - start

            print('- ![](url){style="max-width: 500px;"}')
            print(f'### Library: {name}')
            print()
            print(f'**Time:** {elapsed_time:.3f} seconds')
            print()
            print(f'```text\n{text}\n```')