import json
import os
import re

from reportlab import platypus
from reportlab.lib.pagesizes import letter, A4
from reportlab.lib.units import inch
from reportlab.lib.utils import ImageReader
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.cidfonts import UnicodeCIDFont
from reportlab.pdfgen import canvas
from reportlab.platypus import SimpleDocTemplate, Image

# Register the CID font that write_text draws with.
pdfmetrics.registerFont(UnicodeCIDFont('STSong-Light'))
def write_text(line_dict, w_ratio, h_ratio, height, c, is_table=False):
    """Draw one OCR text line onto the canvas, one character at a time.

    Args:
        line_dict: Mapping with ``'objContent'`` (the text of the line) and
            ``'objPos'``, which is unpacked as ``(y, x, h, w, position)`` in
            image pixels. ``position`` is indexed per character and gives
            that character's horizontal offset from the start of the line.
        w_ratio: Divisor converting horizontal image pixels to page units.
        h_ratio: Divisor converting vertical image pixels to page units.
        height: Page height, used to flip the top-down image ``y`` into the
            canvas's bottom-up coordinate system.
        c: Canvas-like object providing ``setFont`` and ``drawString``.
        is_table: Currently unused; kept so existing callers keep working.
            (An earlier revision stripped a trailing colon that was appended
            to table text lines.)
    """
    text = line_dict['objContent']
    if not text:
        # Nothing to draw; leave the canvas state untouched.
        return

    y, x, h, w, position = line_dict['objPos']
    position = position or ()

    text_x = x / w_ratio
    text_y = y / h_ratio
    text_h = h / h_ratio
    text_w = w / w_ratio

    # The font size follows the box height (minus a small margin). Clamp it
    # so a degenerate box cannot yield a zero or negative size.
    font_size = max(text_h - 2, 1)
    c.setFont('STSong-Light', font_size)

    # Same baseline for every character of the line.
    baseline_y = height - text_y - text_h + 2
    char_count = len(text)

    for index, char in enumerate(text):
        if index == 0:
            offset = 0
        elif index < len(position):
            offset = position[index] / w_ratio - 3
        else:
            # The OCR result carries fewer positions than characters: spread
            # the remaining characters evenly over the line width instead of
            # failing the whole line with an IndexError.
            offset = index * text_w / char_count
        # mode=0 draws visible (filled) text; mode=3 would make it invisible.
        c.drawString(text_x + offset, baseline_y, char, mode=0)
def gen_pdf2(images_path, json_path, gen_pdf2_path):
    """Build a PDF whose pages are the page images with OCR text drawn on top.

    Each file in ``images_path`` becomes one A4 page: the image is stretched
    to fill the page, then the OCR lines for that page are drawn over it
    with ``write_text``.

    Args:
        images_path: Directory containing the page images. Every file name
            must contain a number; pages are ordered by the last number in
            each name.
        json_path: Path of the UTF-8 JSON file produced by OCR. It is
            indexed by page position; each page entry has a ``'lineList'``
            whose items carry ``'objType'`` (``"textLine"`` or ``"table"``)
            and ``'objContent'``.
        gen_pdf2_path: Output path handed to ``canvas.Canvas`` for the
            generated PDF.
    """
    # Read the OCR result; the context manager closes the file handle.
    with open(json_path, 'r', encoding='utf-8') as json_file:
        json_list = json.load(json_file)

    c = canvas.Canvas(gen_pdf2_path, pagesize=A4)
    width, height = A4

    list_dirs = os.listdir(images_path)
    list_dirs.sort(key=lambda x: int(re.findall(r"\d+", x)[-1]))

    # The JSON entry for a page is selected by its position in the sorted
    # listing.
    for page_index, img_file in enumerate(list_dirs):
        img_path = os.path.join(images_path, img_file)

        # Only the pixel dimensions are needed, to map OCR pixel coordinates
        # onto the page.
        img_w, img_h = ImageReader(img_path).getSize()
        w_ratio = img_w / width
        h_ratio = img_h / height

        # reportlab colour components are in the 0-1 range.
        c.setFillColorRGB(1, 0, 0)
        c.drawInlineImage(img_path, 0, 0, width, height)

        # Draw the OCR text over the image. This is best-effort: a problem
        # with one page's OCR data is reported and the page is still emitted.
        try:
            line_lists = json_list[page_index]['lineList']
            for line_dict in line_lists:
                obj_type = line_dict['objType']
                if obj_type == "textLine":
                    write_text(line_dict, w_ratio, h_ratio, height, c, is_table=False)
                elif obj_type == "table":
                    for table_dict in line_dict['objContent']:
                        write_text(table_dict, w_ratio, h_ratio, height, c, is_table=True)
        except Exception as e:
            print(e)
        finally:
            # Always end the page, so a failure here cannot cause the next
            # image to be drawn on top of this one.
            c.showPage()

    c.save()