[Python][Google Cloud Vision API]光学式文字認識（OCR）を使って画像ファイルから文章を抽出する

Google Cloud Vision APIを使って、OCRによる文章抽出を行いました。

もともとはPDFから文章を抽出したかったのですが、今回使った方法ではPDFファイルをそのままOCRにかけることができないため、
PDFをPNGファイルに変換し、その画像ファイルをOCRにかけて文章を抽出しました。

PDFからPNGへの変換は別記事にまとめました。
https://blog.integrityworks.co.jp/2019/11/01/python-pdf-change-to-png/

Google Cloud Vision APIの公式情報
Cloud Vision APIを使うための流れ
サンプルコード
参考

Google Cloud Vision APIの公式情報

Cloud Vision APIについて
https://cloud.google.com/vision/

Vision API のデモ
https://cloud.google.com/vision/#section-2

Vision API の料金
https://cloud.google.com/vision/pricing
DOCUMENT_TEXT_DETECTION (テキスト検出)が今回の対象です。
最初の 1,000 ユニット/月までは、無料のようです。（2019年時点）

Cloud Vision APIを使うための流れ

Vision API を設定

Cloud Vision setup and cleanup | Cloud Vision API | Google Cloud
https://cloud.google.com/vision/docs/setup

個人的には、初めてだとこのあたりの設定が大変だと思いますが、私にはうまく説明できないので、上記の公式ドキュメントの手順に従ってください。

この手順の中の[サービスアカウントキーの作成]で、キーファイル（JSON）を生成します。
生成されたファイルは「<Google Cloudのプロジェクト名>-<xxxxxxx>.json」というような名前だったので、サンプルコードでは「credentials.json」という名前に変更しています。

サンプルコード

OCRにかけた後は、抽出した文章をテキストファイルに、
元画像にブロックなどの範囲を枠で囲んだものを画像ファイルに、それぞれ出力するようにしています。

import cv2
import glob
import io
import os
import shutil
from google.cloud import vision
from google.cloud.vision import types

from google.cloud.vision_v1 import ImageAnnotatorClient
from google.cloud.vision_v1.proto.image_annotator_pb2 import AnnotateImageResponse, AnnotateImageRequest

# Directory scanned for the page images (*.png) to run OCR on.
input_image_dir = "Intermediate_img"
# Root directory for everything the script writes (text and dump files).
output_dir = "output_gcv"
# Sub-directory of the output root that receives the annotated images.
output_image_dir = f"{output_dir}/output_img"

# import gpyocr
#
# # text, conf = gpyocr.tesseract_ocr('test.png', lang='jpn', psm=6)
# # print(text)
# aaa, confidence = gpyocr.google_vision_ocr('test.png', langs=['ja'])

# If the result directories already exist, clear them and create them again
def output_setting():
    """Reset the output directories so every run starts from an empty tree.

    Any existing ``output_dir`` / ``output_image_dir`` is removed together
    with its contents, then both directories are created again.
    """
    targets = (output_dir, output_image_dir)

    # Remove whatever a previous run left behind.
    for directory in targets:
        if os.path.exists(directory):
            shutil.rmtree(directory)

    # Recreate the directories in the same order (root first).
    for directory in targets:
        os.makedirs(directory)

# Extract the text of one page image with the Cloud Vision API
def get_page_document(file_name):
    """Run DOCUMENT_TEXT_DETECTION on one image file and save the results.

    Two files are written below ``output_dir``:

    * ``res_gcv_text_<name minus last 2 chars>.txt`` - the detected text,
      opened in append mode so several page images can share one file.
      The last two characters of the base name are dropped; presumably they
      are a per-page suffix - confirm against the PDF-to-PNG naming scheme.
    * ``res_gcv_json_<name>.txt`` - the ``str()`` representation of the full
      text annotation for this page (overwritten on every call).

    NOTE: ``types.Image`` / ``types.ImageContext`` belong to the
    google-cloud-vision 1.x API; the ``types`` module was removed in 2.0.

    Args:
        file_name: Path of the image file to send to the API.

    Returns:
        The ``full_text_annotation`` of the API response.

    Raises:
        RuntimeError: If the API response carries an error message.
    """
    with open(file_name, 'rb') as image_file:
        content = image_file.read()

    image = types.Image(content=content)
    # Language hint: tell the API the text is expected to be Japanese.
    image_context = types.ImageContext(language_hints=['ja'])

    # The hint only takes effect when it is actually sent with the request.
    response: AnnotateImageResponse = client.document_text_detection(
        image=image, image_context=image_context)

    # A failed request comes back as a response with `error` populated;
    # fail loudly instead of silently writing empty result files.
    if response.error.message:
        raise RuntimeError(
            'Cloud Vision API error for {}: {}'.format(file_name, response.error.message))

    annotation = response.full_text_annotation

    name, _ext = os.path.splitext(os.path.basename(file_name))
    name_slice = name[:-2]

    # Text of all pages of one document is collected in a single file.
    text_path = os.path.join(output_dir, "res_gcv_text_" + name_slice + ".txt")
    with open(text_path, 'a', encoding='utf-8') as file_descriptor_txt:
        file_descriptor_txt.write(annotation.text)

    # One structured dump per page.
    dump_path = os.path.join(output_dir, "res_gcv_json_" + name + ".txt")
    with open(dump_path, 'w', encoding='utf-8') as file_descriptor_json:
        file_descriptor_json.write(str(annotation))

    return annotation

# Draw the rectangle spanned by vertices[0] and vertices[2] of a bounding box
def _draw_bounding_box(image, bounding_box, color, thickness):
    """Draw one axis-aligned frame on ``image`` (modified in place).

    Args:
        image: Image array as returned by ``cv2.imread``.
        bounding_box: Object with a ``vertices`` sequence whose items have
            ``x`` / ``y``; vertices 0 and 2 are used as opposite corners.
        color: Colour tuple passed to ``cv2.rectangle`` (BGR order in OpenCV).
        thickness: Line thickness in pixels.
    """
    vertices = bounding_box.vertices
    cv2.rectangle(image,
                  (vertices[0].x, vertices[0].y),
                  (vertices[2].x, vertices[2].y),
                  color,
                  thickness=thickness)


# For visual clarity, frame the detected text regions on the source image
def analyze_document(file_name):
    """OCR one image and save a copy with the detected regions framed.

    The annotation returned by ``get_page_document`` is walked as
    Page -> Block -> Paragraph -> Word, and a frame is drawn per level:
    blocks in red (thickness 10), paragraphs in green (thickness 5) and
    words in blue (thickness 1). The result is written to
    ``output_image_dir`` under the same base name as the input.

    Args:
        file_name: Path of the image file to analyse.

    Raises:
        ValueError: If OpenCV cannot read ``file_name`` as an image.
    """
    out = cv2.imread(file_name)
    # cv2.imread signals failure by returning None; stop before spending an
    # API call on a file that cannot be annotated anyway.
    if out is None:
        raise ValueError("Could not read image file: {}".format(file_name))

    doc = get_page_document(file_name)

    # full_text_annotation -> Page -> Block -> Paragraph -> Word -> Symbols.
    for page in doc.pages:
        for block in page.blocks:
            # Red
            _draw_bounding_box(out, block.bounding_box, (0, 0, 255), 10)

            for paragraph in block.paragraphs:
                # Green
                _draw_bounding_box(out, paragraph.bounding_box, (0, 255, 0), 5)

                for word in paragraph.words:
                    # Blue
                    _draw_bounding_box(out, word.bounding_box, (255, 0, 0), 1)

    output_img_name = os.path.join(output_image_dir, os.path.basename(file_name))
    cv2.imwrite(output_img_name, out)

if __name__ == '__main__':
    # Script entry point: reset the output tree, then OCR every PNG found
    # in the input directory.
    output_setting()

    # glob returns paths in arbitrary order, but the text of all pages is
    # appended to one shared file per document, so process the images in
    # sorted (file name) order to keep the page order deterministic.
    image_list = sorted(glob.glob(input_image_dir + "/*.png"))

    # Point the Google client library at the service account key file.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "credentials.json"
    # Module-level client; get_page_document() reads it as a global.
    client: ImageAnnotatorClient = vision.ImageAnnotatorClient()

    for file_name in image_list:
        analyze_document(file_name)

参考

https://googleapis.github.io/google-cloud-python/latest/vision/gapic/v1/api.html