# huydungasd.github.io

import zipfile

import PIL
from PIL import Image
import pytesseract
import cv2 as cv
import numpy as np

# Load the Haar-cascade frontal-face classifier once, at module import time.
# The XML path is relative, so it is resolved against the current working
# directory of the process running this script.
face_cascade = cv.CascadeClassifier('readonly/haarcascade_frontalface_default.xml')

def newspaper_info(link_zip):
    """Read every page image in a zip archive and extract its text.

    Args:
        link_zip: Path of a zip archive whose file members are page images.

    Returns:
        A list with one dict per file member of the archive, in archive
        order. Directory entries are skipped. Each dict has the keys:

        * ``'file_name'``: the member's name inside the archive.
        * ``'image'``: the page as opened by ``PIL.Image.open``.
        * ``'text'``: the string ``pytesseract.image_to_string`` returned
          for the grayscale (mode ``'L'``) version of the page.
        * ``'gray'``: the single-channel array ``cv.cvtColor`` produced
          from the page, ready to be passed to an OpenCV detector.
    """
    newspaper = []
    # ZipFile is the general-purpose reader; PyZipFile only adds helpers for
    # *writing* archives of Python modules, which are not needed here.
    with zipfile.ZipFile(link_zip) as archive:
        for entry in archive.infolist():
            # Directory entries carry no image data and cannot be opened
            # as a picture, so skip them instead of failing on them.
            if entry.is_dir():
                continue
            with archive.open(entry) as file:
                page_info = {'file_name': entry.filename}
                print('Analysing in {}'.format(entry.filename))

                page_image = Image.open(file)
                # Image.open is lazy: decode the pixels now, while the
                # archive member is still open, because the image is kept
                # and used after this ``with`` block has closed the stream.
                page_image.load()
                page_info['image'] = page_image

                grayscale = page_image.convert('L')
                page_info['text'] = pytesseract.image_to_string(grayscale)

                # OpenCV works on BGR arrays: expand to RGB, reverse the
                # channel axis, then let OpenCV reduce it to one channel.
                rgb_pixels = np.array(grayscale.convert('RGB'))
                bgr_pixels = rgb_pixels[:, :, ::-1].copy()
                page_info['gray'] = cv.cvtColor(bgr_pixels, cv.COLOR_BGR2GRAY)

                newspaper.append(page_info)
    print('Analysing fiinished!')
    return newspaper

def text_search_and_img_display(text_search, level_dectection, newspaper):
    """Show a contact sheet of detected faces for each page containing a string.

    For every page whose ``'text'`` contains ``text_search`` the file name is
    printed. Faces are then detected on the page's ``'gray'`` array with the
    module-level ``face_cascade``. If none are found a message is printed;
    otherwise each face is cropped from the page's ``'image'``, shrunk to fit
    a 100x100 cell, pasted into a sheet five cells wide, and the sheet is
    passed to ``display``.

    Args:
        text_search: Substring looked up in each page's text with ``in``
            (so the match is case-sensitive).
        level_dectection: Scale factor handed to
            ``face_cascade.detectMultiScale`` as its second argument.
        newspaper: Iterable of page dicts with the keys ``'file_name'``,
            ``'text'``, ``'gray'`` and ``'image'``, as built by
            ``newspaper_info``.

    Returns:
        None. Results are printed and displayed.

    Note:
        ``display`` is neither defined nor imported in this module;
        presumably it is the builtin a Jupyter/IPython session provides.
        TODO: confirm before running this as a plain script.
    """
    cell = 100      # side length, in pixels, of one thumbnail cell
    per_row = 5     # number of thumbnails per contact-sheet row

    for page in newspaper:
        if text_search not in page['text']:
            # Face detection is only needed for pages that match, so pages
            # without the search string are skipped before running it.
            continue
        print('Results found in file {}'.format(page['file_name']))

        # The third positional argument is detectMultiScale's minNeighbors.
        bounding_boxes = list(
            face_cascade.detectMultiScale(page['gray'], level_dectection, 5)
        )
        if not bounding_boxes:
            print('But there were no faces in that file!')
            continue

        row_count = (len(bounding_boxes) - 1) // per_row + 1
        contact_sheet = Image.new(
            page['image'].mode, (cell * per_row, cell * row_count)
        )
        for index, (left, top, width, height) in enumerate(bounding_boxes):
            # detectMultiScale yields (x, y, w, h); crop() wants
            # (left, upper, right, lower).
            crop_box = (
                int(left),
                int(top),
                int(left + width),
                int(top + height),
            )
            face = page['image'].crop(crop_box)
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter under its current name.
            face.thumbnail((cell, cell), Image.LANCZOS)
            row, column = divmod(index, per_row)
            contact_sheet.paste(face, (column * cell, row * cell))
        display(contact_sheet)
    return None

# First run: index the pages in small_img.zip, then show the faces found on
# every page whose text contains 'Christophe'.
my_newspaper = newspaper_info(link_zip='./readonly/small_img.zip')
text_search_and_img_display(
    text_search='Christophe',
    level_dectection=1.3,
    newspaper=my_newspaper,
)

# Second run: same procedure on images.zip, searching for 'Mark'.
my_newspaper2 = newspaper_info(link_zip='./readonly/images.zip')
text_search_and_img_display(
    text_search='Mark',
    level_dectection=1.3,
    newspaper=my_newspaper2,
)