# OCR an image with Tesseract and print every recognised line that contains
# a '<<<' run, with all spaces removed from the line.
try:
    from PIL import Image
except ImportError:
    # Fallback for installations that expose the Image module at top level.
    import Image
import pytesseract

FILENAME = 'test.jpg'

# Open the image in a context manager so the underlying file handle is
# released as soon as the OCR call has finished.
with Image.open(FILENAME) as image:
    text = pytesseract.image_to_string(image)

for line in text.split("\n"):
    if '<<<' in line:
        # Drop the spaces that appear inside the recognised line.
        print(line.replace(' ', ''))
test1.png P<GBRUK<SPECIMEN<<ANGELA<ZOE<<<<<<<<<<<<<<<< 5334755143GBR8812049F2509286<<<<<<<<<<c<<<04 test2.png PDCYPPOLITIS<<ZINONAS<<<<<<<<<<<<<<KKKKKKKKK FOOOOOD005CYP8012148M3006151<<<<<<<<<<<<<<02 test3.png PTNOROESTENBYEN<<AASAMUND<SPECIMEN<<<<<<<<<< FHCO023539NOR5604230M2506126<<<<<<<<<<<<<<00
なぜか変な文字列が混ざりますね
def convert(Filename, border=110):
    """Return an RGB copy of an image with its brighter pixels turned white.

    A pixel is replaced by pure white when any of its R, G or B values is
    greater than ``border``; all other pixels are copied unchanged.
    NOTE(review): presumably this is meant to wipe out a light background
    before OCR -- confirm against the intended input images.

    Args:
        Filename: Path of the image file to read.
        border: Channel threshold; defaults to 110.

    Returns:
        A new RGB image of the same size as the input.
    """
    white = (255, 255, 255)

    # convert() returns a separate in-memory image, so the source file can
    # be closed as soon as the conversion is done.
    with Image.open(Filename) as source:
        rgb = source.convert('RGB')

    width, height = rgb.size
    result = Image.new('RGB', rgb.size)

    # Pixel-access objects avoid the per-call overhead of getpixel/putpixel.
    src_pixels = rgb.load()
    dst_pixels = result.load()
    for x in range(width):
        for y in range(height):
            pixel = src_pixels[x, y]
            # max(pixel) > border  <=>  r > border or g > border or b > border
            dst_pixels[x, y] = white if max(pixel) > border else pixel
    return result


# Run OCR on the thresholded image and print each recognised line that
# contains a '<<<' run (the line is printed as-is, spaces included).
text = pytesseract.image_to_string(convert('test.jpg'))
for line in text.split("\n"):
    if '<<<' in line:
        print(line)
うーむ、ちょっと違うかな