I'm trying to run the Azure OCR API on 6000+ images. Unfortunately the code stalls after just 90 images.
Documentation:
- https://docs.microsoft.com/en-us/azure/cognitive-services/computer-vision/quickstarts/python-print-text#prerequisites
- https://docs.microsoft.com/en-us/azure/cognitive-services/computer-vision/quickstarts/python-disk
Input: 6000+ Images (.png)
Desired Output:
- File with extracted text only
- File with extracted text and their corresponding bounding boxes
Error Msg: ConnectionError: HTTPSConnectionPool(host='westcentralus.api.cognitive.microsoft.com', port=443): Max retries exceeded with url: /vision/v2.0/ocr?language=unk&detectOrientation=true (Caused by NewConnectionError(': Failed to establish a new connection: [Errno -2] Name or service not known',))
I've added a 60-second delay after every 10 images, which should keep the script within the 20-transactions-per-minute quota.
import warnings
warnings.filterwarnings("ignore")
import glob
import os
import requests
import pandas as pd
import time
# Replace the value of subscription_key with your subscription key.
subscription_key = "{key}"
assert subscription_key
# Replace the value of vision_base_url (not necessary for trial version).
vision_base_url = "https://westcentralus.api.cognitive.microsoft.com/vision/v2.0/"
analyze_url = vision_base_url + "ocr"
# Initializing source and output directories.
source_directory = glob.glob('folder/with/6000/images/*.png')
output_directory_textFiles = 'folder/for/saving/6000/textFiles/'
output_directory_JSONFiles = 'folder/for/saving/6000/JSONFiles/'
# exist_ok=True creates each directory only if it is missing, replacing the
# race-prone "check exists, then create" pattern.
os.makedirs(output_directory_textFiles, exist_ok=True)
os.makedirs(output_directory_JSONFiles, exist_ok=True)
# Define Function for Extracting Text
def extract_text(image_path, max_retries=5):
    """Run Azure OCR on one image and return word-level results.

    Parameters
    ----------
    image_path : str
        Path to a .png image on disk.
    max_retries : int, optional
        Number of attempts on transient network failures before giving up.
        Transient DNS/connection errors ("Name or service not known") are
        what aborted the original 6000-image run around image 90.

    Returns
    -------
    list[dict]
        One dict per recognized word (keys 'boundingBox' and 'text' as
        returned by the /ocr endpoint); empty list when no text is found.
    """
    # Context manager guarantees the file handle is closed even if the
    # HTTP request below raises.
    with open(image_path, "rb") as image_file:
        image_data = image_file.read()
    headers = {
        'Ocp-Apim-Subscription-Key': subscription_key,
        'Content-Type': 'application/octet-stream',
    }
    params = {'language': 'unk', 'detectOrientation': 'true'}
    # Retry with exponential backoff. A timeout is essential: without one,
    # requests.post can block forever and the whole batch stalls.
    for attempt in range(max_retries):
        try:
            response = requests.post(
                analyze_url, headers=headers, params=params,
                data=image_data, timeout=30)
            if response.status_code == 429:
                # Quota exceeded: honor the service's Retry-After hint.
                time.sleep(int(response.headers.get('Retry-After', 60)))
                continue
            response.raise_for_status()
            break
        except requests.exceptions.RequestException:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)
    analysis = response.json()
    return _collect_word_infos(analysis)


def _collect_word_infos(analysis):
    """Flatten the /ocr JSON response into a flat list of word dicts."""
    word_infos = []
    # 'regions' can be absent when the image contains no recognizable text;
    # .get with a default avoids a KeyError in that case.
    for region in analysis.get("regions", []):
        for line in region["lines"]:
            word_infos.extend(line["words"])
    return word_infos
# Generate one text file and one JSON file per image.
for counter, image in enumerate(sorted(source_directory), start=1):
    print('Processing %d %s' % (counter, image))
    word_infos = extract_text(image)
    # Strip directory and extension; os.path is robust across separators,
    # unlike the fragile image.split('/') approach.
    filename = os.path.splitext(os.path.basename(image))[0]
    output = pd.DataFrame(word_infos)
    if word_infos:
        # 'boundingBox' arrives as "x,y,width,height" — split it into four
        # separate columns and drop the combined field.
        output[['x', 'y', 'width', 'height']] = \
            output['boundingBox'].str.split(',', expand=True)
        output = output.drop(['boundingBox'], axis=1)
        text_series = output['text']
    else:
        # No text detected: still write (empty) output files for this image
        # so every input has a corresponding pair of outputs.
        text_series = output
    text_series.to_csv(r'{}/{}.txt'.format(output_directory_textFiles, filename),
                       header=False, index=None, sep=',')
    # 'with' closes the file automatically; no explicit f.close() needed.
    with open(r'{}/{}.txt'.format(output_directory_JSONFiles, filename), 'w') as f:
        f.write(output.to_json(orient='records'))
    # Stay under the 20-transactions-per-minute free-tier quota:
    # pause one minute after every 10 requests.
    if counter % 10 == 0:
        time.sleep(60)