Enabling Metrics
The OCR system automatically tracks metrics for all operations. Metrics include files processed, pages, characters, confidence scores, and processing time.
from upsonic.ocr import OCR
from upsonic.ocr.layer_1.engines import EasyOCREngine
# Build an EasyOCR engine and wrap it in the OCR orchestrator
engine = EasyOCREngine(languages=['en'])
ocr = OCR(layer_1_ocr_engine=engine)

# Run several inputs through the same orchestrator; metrics are tracked
# automatically for every operation
for source_path in ('document1.pdf', 'document2.pdf', 'image.png'):
    ocr.get_text(source_path)

# Read the tracked metrics and report each field
metrics = ocr.get_metrics()
print(f"Files processed: {metrics.files_processed}")
print(f"Total pages: {metrics.total_pages}")
print(f"Total characters: {metrics.total_characters}")
print(f"Average confidence: {metrics.average_confidence:.2%}")
print(f"Total processing time: {metrics.processing_time_ms:.2f}ms")
print(f"Provider: {metrics.provider}")

# Clear the metrics before starting a new batch
ocr.reset_metrics()
Analyzing Performance
Use metrics to analyze and optimize OCR performance across different providers and configurations.
from upsonic.ocr import OCR
from upsonic.ocr.layer_1.engines import EasyOCREngine, RapidOCREngine, TesseractOCREngine
def benchmark_providers(file_path):
    """Compare performance of different OCR providers.

    Processes ``file_path`` once with each of EasyOCR, RapidOCR and
    Tesseract, prints a per-provider comparison, and returns the numbers.

    Args:
        file_path: Path of the file to process with every provider.

    Returns:
        A dict keyed by provider name. Each value is a dict with the keys
        ``'confidence'``, ``'processing_time_ms'`` and ``'characters'``,
        taken from that provider's result for ``file_path``.
    """
    providers = [
        ('EasyOCR', EasyOCREngine(languages=['en'], gpu=False)),
        ('RapidOCR', RapidOCREngine(languages=['en'])),
        ('Tesseract', TesseractOCREngine(languages=['eng'])),
    ]

    results = {}
    for name, engine in providers:
        ocr = OCR(layer_1_ocr_engine=engine)
        ocr.reset_metrics()

        # Process file
        result = ocr.process_file(file_path)

        # Record the per-file figures reported on the result itself
        results[name] = {
            'confidence': result.confidence,
            'processing_time_ms': result.processing_time_ms,
            'characters': len(result.text),
        }

    # Print comparison
    print("Provider Performance Comparison:")
    for name, data in results.items():
        print(f"\n{name}:")
        print(f" Confidence: {data['confidence']:.2%}")
        print(f" Time: {data['processing_time_ms']:.2f}ms")
        print(f" Characters: {data['characters']}")

    return results


# Run benchmark
benchmark_providers('test_document.pdf')

