import zipfile
import PIL
from PIL import Image
import pytesseract
import cv2 as cv
import numpy as np
# Load the pre-trained Haar cascade for frontal faces once, at import time; the
# resulting classifier is the one used by the face-detection call further down.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): presumably OpenCV does not raise here when the XML file is
# missing and only fails later at detection time -- consider checking
# face_cascade.empty(); confirm against the OpenCV documentation.
face_cascade = cv.CascadeClassifier('readonly/haarcascade_frontalface_default.xml')
def newspaper_info(link_zip):
    """Run OCR on every page image stored in a zip archive.

    Args:
        link_zip: Path to (or binary file-like object of) a zip archive
            whose file members are page images that PIL can open.

    Returns:
        A list with one dict per file member, in archive order. Each dict
        has the keys ``'file_name'`` (member name), ``'image'`` (the PIL
        image as stored in the archive), ``'text'`` (pytesseract output for
        the greyscale page) and ``'gray'`` (the greyscale page as a NumPy
        array, ready for OpenCV).
    """
    newspaper = []
    with zipfile.ZipFile(link_zip) as archive:
        for entry in archive.infolist():
            # Directory entries carry no image data; handing them to PIL
            # would abort the whole run.
            if entry.is_dir():
                continue
            print('Analysing in {}'.format(entry.filename))
            with archive.open(entry) as file:
                img = Image.open(file)
                # PIL opens lazily: decode the pixels now, while the member
                # stream is still open, so the stored image stays usable
                # after this ``with`` block closes it.
                img.load()
                gray_img = img.convert('L')
                newspaper.append({
                    'file_name': entry.filename,
                    'image': img,
                    'text': pytesseract.image_to_string(gray_img),
                    # The page is already single-channel, so it converts
                    # straight to an array; no RGB/BGR round trip through
                    # OpenCV is needed to obtain the grey image.
                    'gray': np.array(gray_img),
                })
    print('Analysing fiinished!')
    return newspaper
def text_search_and_img_display(text_search, level_dectection, newspaper):
    """Display a contact sheet of detected faces for each page containing a text.

    For every page whose OCR text contains ``text_search`` the file name is
    printed; faces are then detected on the page's grey image and pasted as
    thumbnails (at most 100x100 each, five per row) onto a contact sheet
    that is displayed. Pages without faces get a message instead.

    Args:
        text_search: Substring looked up in each page's ``'text'`` entry
            (plain ``in`` test, so the match is case-sensitive).
        level_dectection: Scale factor handed to ``detectMultiScale``.
        newspaper: Iterable of page dicts with the keys ``'file_name'``,
            ``'text'``, ``'gray'`` and ``'image'``, as produced by
            ``newspaper_info``.

    Returns:
        None. Results are printed and displayed.
    """
    thumb_size = 100
    per_row = 5
    for page in newspaper:
        # Face detection is the expensive step, so skip pages that do not
        # mention the search text before running it.
        if text_search not in page['text']:
            continue
        print('Results found in file {}'.format(page['file_name']))
        bounding_boxes = list(
            face_cascade.detectMultiScale(page['gray'], level_dectection, 5)
        )
        if not bounding_boxes:
            print('But there were no faces in that file!')
            continue
        rows = (len(bounding_boxes) - 1) // per_row + 1
        contact_sheet = Image.new(
            page['image'].mode, (thumb_size * per_row, thumb_size * rows)
        )
        for index, (left, top, width, height) in enumerate(bounding_boxes):
            # detectMultiScale yields (x, y, w, h); PIL's crop wants the
            # corner coordinates (left, upper, right, lower).
            face = page['image'].crop(
                (int(left), int(top), int(left + width), int(top + height))
            )
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
            face.thumbnail((thumb_size, thumb_size), Image.LANCZOS)
            row, column = divmod(index, per_row)
            contact_sheet.paste(face, (column * thumb_size, row * thumb_size))
        # NOTE(review): ``display`` is neither defined nor imported in the
        # visible source -- presumably the IPython/Jupyter built-in; confirm
        # before running this outside a notebook.
        display(contact_sheet)
# Small sample archive: OCR every page, then show the faces found on the
# pages that mention 'Christophe'.
my_newspaper = newspaper_info(link_zip='./readonly/small_img.zip')
text_search_and_img_display(
    text_search='Christophe',
    level_dectection=1.3,
    newspaper=my_newspaper,
)

# Full archive: same pipeline, searching for 'Mark'.
my_newspaper2 = newspaper_info(link_zip='./readonly/images.zip')
text_search_and_img_display(
    text_search='Mark',
    level_dectection=1.3,
    newspaper=my_newspaper2,
)