Extracted image has visibly worse resolution #3597
Unanswered
aleblanc30
asked this question in
Looking for help
Replies: 1 comment
-
Hi again, I may not have provided enough information in my initial post, so let me update it with my script and the PDF I am working with. If someone wants to engage with this question but lacks information, please ask me to expand; I'll gladly do so, as I need the help :-)
import pymupdf
import os
import matplotlib.pyplot as plt
from PIL import Image, ImageTk
from pprint import pprint
from math import sqrt
import numpy as np
from copy import deepcopy
# Open the PDF that sits next to this script.
# `script_dir` replaces the original name `dir`, which shadowed the builtin.
script_dir = os.path.dirname(__file__)
fn = os.path.join(script_dir, 'to_extract.pdf')
doc = pymupdf.open(fn)
# Everything below works on the first page only.
page = doc.load_page(0)
# Collect the scale ticks: drawings made of exactly one short horizontal line.
ticks = []
for drawing in page.get_drawings():
    items = drawing['items']
    if len(items) != 1:
        continue
    item = items[0]
    # A line item is a 3-tuple ('l', start, end); keep it only if both
    # end points share the same y coordinate.
    is_horizontal_line = len(item) == 3 and item[0] == 'l' and item[1].y == item[2].y
    if not is_horizontal_line:
        continue
    p1, p2 = item[1], item[2]
    norm = sqrt((p1.x - p2.x)**2 + (p1.y - p2.y)**2)
    # Only very short segments (length below 0.65) count as ticks.
    if norm < .65:
        ticks.append(drawing)
# extract rasters, colorbars and scale labels
# Every image on the page is sorted by its pixel size into one of four lists.
scale_bars = []
labels = []
rasters = []
others = []
page.clean_contents()
for image_item in page.get_images(full=True):
    xref = image_item[0]
    smask = image_item[1]
    try:
        bbox, transform = page.get_image_bbox(image_item, transform=True)
    except Exception:
        # Deliberately best-effort: an image whose bbox/transform cannot be
        # obtained is skipped. `Exception` (instead of the original bare
        # `except:`) no longer swallows KeyboardInterrupt / SystemExit.
        continue
    # Enrich the extracted image dict with where it came from and where it sits.
    img = doc.extract_image(xref)
    img['xref'] = xref
    img['smask'] = smask
    img['bbox'] = bbox
    img['transform'] = transform
    w, h = img['width'], img['height']
    if h > 120:
        # Tall images: narrow ones are colour bars, wide ones are rasters.
        if w < 100:
            scale_bars.append(img)
        elif w > 160:
            rasters.append(img)
        else:
            others.append(img)
    elif w <= 10 and h <= 20:
        # Tiny images are tick-label candidates. Find the tick whose first
        # end point is closest to the middle of the image's right edge.
        x, y = bbox.x1, .5*(bbox.y1 + bbox.y0)
        dmin, imin = 1e10, -1
        for i, t in enumerate(ticks):
            p1, p2 = t['items'][0][1:]
            d = sqrt((p1.x - x)**2 + (p1.y - y)**2)
            if d < dmin:
                dmin, imin = d, i
        if dmin < 3:
            ticks[imin]['label_img'] = img
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether this append belongs inside `if dmin < 3` and which `if` the
        # final `else` pairs with. `labels` and `others` are not read later
        # in the script, so the choice does not affect the results - confirm.
        labels.append(img)
    else:
        others.append(img)
# run ocr on ticks
# Each tick that received a label image gets that image OCR'ed; the recognised
# number is stored under tick['label'], clamped to the range 0..3.
for tick in ticks:
    if 'label_img' not in tick:
        continue
    img = tick['label_img']
    pix = pymupdf.Pixmap(doc.extract_image(img['xref'])['image'])
    mode = "RGBA" if pix.alpha else "RGB"
    img_ = np.array(Image.frombytes(mode, [pix.width, pix.height], pix.samples), dtype=np.uint8)
    samples = bytearray(img_.tobytes())  # get plain pixel data from numpy array
    # NOTE(review): when pix.alpha is true the samples hold 4 bytes per pixel,
    # yet the pixmap below is created as RGB without alpha - confirm that
    # label images never carry an alpha channel.
    pix = pymupdf.Pixmap(pymupdf.csRGB, img_.shape[1], img_.shape[0], samples, False)
    # Close the temporary OCR document as soon as its text has been read.
    with pymupdf.open("pdf", pix.pdfocr_tobytes()) as temp_doc:
        text = temp_doc[0].get_text()
    if text != '':
        try:
            # OCR commonly confuses the digit 0 with the letter O.
            value = int(text.strip(' \n').replace('O', '0'))
        except ValueError:
            # Unreadable OCR output: leave the tick unlabeled instead of
            # crashing; later code reads missing labels with tick.get('label').
            continue
        tick['label'] = max(0, min(value, 3))
# Attach to every scale bar the ticks lying close to its right edge.
for scale_bar in scale_bars:
    bar_box = scale_bar['bbox']
    # Vertical extent of the bar (ordered) and x of its right edge.
    ymin, ymax = min(bar_box.y0, bar_box.y1), max(bar_box.y0, bar_box.y1)
    scale_x = bar_box.x1
    scale_bar['ticks'] = []
    scale_bar['ticklabels'] = []
    for tick in ticks:
        p2 = tick['items'][0][2]
        d2 = (scale_x - p2.x)**2
        outside_bar = p2.y < ymin or p2.y > ymax
        if outside_bar:
            # NOTE(review): this adds the squared distance to the *farther*
            # end of the bar; a point-to-segment distance would use min().
            # Kept as in the original - confirm the intent.
            d2 += max((p2.y - ymin)**2, (p2.y - ymax)**2)
        d = sqrt(d2)
        if d >= 10:
            continue
        p1 = tick['items'][0][1]
        # Tick position as a fraction of the bar height (0 at ymax, 1 at ymin).
        scale_bar['ticks'].append((ymax - p1.y)/(ymax - ymin))
        scale_bar['ticklabels'].append(tick.get('label', None))
class Graph:
    """A raster image together with the anchor point used to match scale bars.

    Attributes:
        raster: the image dict passed to the constructor; only its 'bbox'
            entry is read here.
        x: right edge (x1) of the raster's bounding box.
        y: vertical midpoint of the raster's bounding box.
        scale_bar: None until a scale bar dict is assigned from outside.
    """

    def __init__(self, raster):
        bbox = raster['bbox']
        self.raster = raster
        self.x = bbox.x1
        self.y = .5*(bbox.y0 + bbox.y1)
        # Declared here so the attribute always exists; the original only
        # created it when a scale bar was assigned from outside the class.
        self.scale_bar = None
# Wrap every raster in a Graph and order them top to bottom.
graphs = sorted((Graph(raster) for raster in rasters), key=lambda graph: graph.y)
# Cut the ordered list into five rows of four; a sixth row takes whatever is left.
graphs = [graphs[start:start + 4] for start in range(0, 20, 4)] + [graphs[20:]]
# Inside each row, order the graphs left to right.
graphs = [sorted(row, key=lambda graph: graph.x) for row in graphs]
# Give each scale bar to the graph whose anchor point is nearest to the
# middle of the bar's left edge.
graphs_with_scales = []
for scale_bar in scale_bars:
    bar_box = scale_bar['bbox']
    x = bar_box.x0
    y = .5*(bar_box.y0 + bar_box.y1)
    # Sentinel indices / distance: only graphs closer than 1e5 can win.
    imin, jmin, dmin = 1000, 1000, 1e5
    for i, row in enumerate(graphs):
        for j, g in enumerate(row):
            d = sqrt((x - g.x)**2 + (y - g.y)**2)
            if d < dmin:
                dmin, imin, jmin = d, i, j
    graphs[imin][jmin].scale_bar = scale_bar
# Hand-written table for this page: grid position (row, column) of a graph
# -> grid position of the graph whose scale bar is copied onto it.
missing_correct = {
    (2, 0): (2, 1),
    (0, 2): (1, 3),
    (1, 2): (1, 3),
    (0, 3): (1, 3),
    (4, 0): (3, 0),
    (4, 1): (3, 1),
    (5, 1): (3, 1),
}
for (i, j), (src_i, src_j) in missing_correct.items():
    g = graphs[i][j]
    # Work on a deep copy so moving the bbox leaves the source bar untouched.
    g.scale_bar = deepcopy(graphs[src_i][src_j].scale_bar)
    bbox = g.scale_bar['bbox']
    w = bbox.x1 - bbox.x0
    h = bbox.y1 - bbox.y0
    # Re-anchor the copied bar one unit right of the raster, top edges
    # aligned, keeping its width and height.
    x0 = g.raster['bbox'].x1 + 1
    y0 = g.raster['bbox'].y0
    bbox.x0, bbox.y0 = x0, y0
    bbox.x1, bbox.y1 = x0 + w, y0 + h
# check consistency of OCR ticks and attribute labels to other ticks
# For every graph, sort the scale-bar ticks by position and fill in labels the
# OCR step did not provide, assuming labels change by 1 from tick to tick.
for i, r in enumerate(graphs):
    for j, g in enumerate(r):
        sb = g.scale_bar
        # A block-local name is used here: the original rebound the
        # module-level `ticks` list of tick drawings.
        pairs = sorted(
            ([t, tl] for t, tl in zip(sb['ticks'], sb['ticklabels'])),
            key=lambda pair: pair[0],
        )
        # Forward pass: an unlabeled tick takes its predecessor's label + 1.
        for k in range(1, len(pairs)):
            if pairs[k][1] is None and pairs[k-1][1] is not None:
                pairs[k][1] = pairs[k-1][1] + 1
        # Backward pass, walking down from the end so one sweep reaches all
        # leading unlabeled ticks. The original repeated an ascending sweep
        # 5 times, which gives the same labels but stops after 5 leading gaps.
        for k in range(len(pairs) - 2, -1, -1):
            if pairs[k][1] is None and pairs[k+1][1] is not None:
                pairs[k][1] = pairs[k+1][1] - 1
        sb['ticks'] = [pair[0] for pair in pairs]
        sb['ticklabels'] = [pair[1] for pair in pairs]
        # No label could be inferred at all: fall back to the default labels.
        if None in sb['ticklabels']:
            sb['ticklabels'] = [-1, 0, 1, 2, 3]
# create plot to ascertain that everything is in order
# Optional visual check (off by default): draws every raster's bbox as a
# coloured rectangle, its scale bar image, and the tick labels on that bar.
plot = False
if plot:
    import matplotlib.patches as patches
    plt.figure()
    ax = plt.axes()
    # The original set the limits twice; only the last pair took effect.
    ax.set_xlim(-100, 1000)
    ax.set_ylim(-100, 1000)
    ax.invert_yaxis()
    c = ['r', 'g', 'b', 'c', 'y', 'm']
    for i, r in enumerate(graphs):
        for j, g in enumerate(r):
            # Raster bounding box as a filled rectangle.
            bbox = g.raster['bbox']
            rect = patches.Rectangle((bbox.x0, bbox.y0), bbox.x1-bbox.x0, bbox.y1-bbox.y0, fc=c[(i+j)%6], fill=True, visible=True)
            ax.add_artist(rect)
            # Scale bar image drawn at its bounding box.
            bbox = g.scale_bar['bbox']
            x0, y0 = bbox.x0, bbox.y0
            w, h = bbox.x1-bbox.x0, bbox.y1-bbox.y0
            pix = pymupdf.Pixmap(doc.extract_image(g.scale_bar['xref'])['image'])
            if g.scale_bar['smask']:
                # Merge the soft mask only when the image has one; the
                # original passed the smask value to extract_image unchecked.
                mask = pymupdf.Pixmap(doc.extract_image(g.scale_bar['smask'])['image'])
                pix = pymupdf.Pixmap(pix, mask)
            mode = "RGBA" if pix.alpha else "RGB"
            img_ = Image.frombytes(mode, [pix.width, pix.height], pix.samples)
            ax.imshow(img_, extent=[x0, x0+w, y0, y0+h], origin='lower')
            # Tick labels and markers along the vertical centre line of the bar.
            xticks, yticks = [], []
            for tick, lbl in zip(g.scale_bar['ticks'], g.scale_bar['ticklabels']):
                x, y = x0+.5*w, bbox.y1-h*tick
                ax.text(x, y, lbl)
                xticks.append(x)
                yticks.append(y)
            plt.plot(xticks, yticks, 'ok')
    plt.show()
# extract timeseries
# For every graph: decode the raster image, sample it on a fixed ni x nj grid
# and show the decoded image (with the sample points) next to the sampled one.
from scipy.interpolate import interp1d, RegularGridInterpolator
import io

# Sampling grid in normalised image coordinates (0..1 along each axis).
# It is the same for every graph, so it is built once, outside the loops.
ni = 19
si = np.linspace(0, 1, ni)
si = .022*si + .97*(1 - si)
nj = 45
sj = np.linspace(0, 1, nj)
sj = .01*sj + .985*(1 - sj)
# Named rows/cols so the loop indices i, j are no longer overwritten.
rows, cols = np.meshgrid(si[::-1], sj[::-1], indexing='ij')

for i, r in enumerate(graphs):
    for j, g in enumerate(r):
        # Colour bar: keep pixel column 5 with the row order reversed.
        # (`scale` is not used further in this part of the script.)
        pix = pymupdf.Pixmap(doc.extract_image(g.scale_bar['xref'])['image'])
        scale = np.array(Image.frombytes("RGB", [pix.width, pix.height], pix.samples), dtype=np.uint8)
        scale = scale[::-1, 5, :]
        # Print the raster metadata without the raw image bytes.
        print({key: value for key, value in g.raster.items() if key != 'image'})
        # Decode the embedded image at its native pixel size. convert("RGB")
        # guarantees three channels for the channel indexing below.
        raster = np.array(Image.open(io.BytesIO(g.raster['image'])).convert("RGB"), dtype=np.uint8)
        print(raster.shape)
        row_axis = np.linspace(0, 1, raster.shape[0])
        col_axis = np.linspace(0, 1, raster.shape[1])
        _, axs = plt.subplots(1, 2)
        # interpolation='nearest' shows the decoded pixels as they are;
        # matplotlib's default may smooth the image when it is resampled to
        # screen size, which can make sharp rectangles look blurred.
        axs[0].imshow(raster, extent=[0, 1, 0, 1], interpolation='nearest')
        axs[0].plot(cols.flatten(), rows.flatten(), 'or')
        sampled = np.empty((ni, nj, 3), dtype=np.uint8)
        print(rows.shape, cols.shape, sampled.shape)
        # One interpolator per colour channel, evaluated on the sampling grid.
        for channel in range(3):
            interpolator = RegularGridInterpolator((row_axis, col_axis), raster[:, :, channel])
            sampled[:, :, channel] = interpolator((rows, cols))
        axs[1].imshow(sampled, extent=[0, 1, 0, 1], interpolation='nearest')
        # NOTE(review): the original indentation was lost; showing one figure
        # per graph (inside the loop) is assumed - confirm.
        plt.show()
Beta Was this translation helpful? Give feedback.
0 replies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
-
Hi ! I'm looking to extract image data from a pdf.
I wrote a script that goes like this :
However, the image I get is visibly of lower resolution than the one in the PDF. I know this because the image is made of rectangles, which are sharp in the PDF but blurred in the extracted image.
Most questions asked online about resolution are about the resolution of the image of a whole page, obtained with `page.get_pixmap`, and are solved with the `dpi` or `transform` arguments. This does not seem to apply to my issue. Any help would be appreciated.
Beta Was this translation helpful? Give feedback.
All reactions