Extracted image has visibly worse resolution #3597

aleblanc30 · 2024-06-19T16:40:48Z

aleblanc30
Jun 19, 2024

Hi ! I'm looking to extract image data from a pdf.

I wrote a script that goes like this :

# Walk over every image referenced by the page and turn it into a numpy array.
for img in page.get_images(full=True):
    # First entry of each get_images() item is used as the image's xref.
    xref = img[0]
    # Build a Pixmap from the image bytes returned by extract_image().
    pix = pymupdf.Pixmap(doc.extract_image(xref)['image'])
    # NOTE(review): the mode is hard-coded, so pix.samples is interpreted as
    # RGB regardless of the pixmap's actual colorspace/alpha - confirm that
    # every image really is 3-channel without alpha.
    mode = "RGB"
    raster = np.array(Image.frombytes(mode, [pix.width, pix.height], pix.samples), dtype=np.uint8)

However, the image I get is visibly of lower resolution than the one in the PDF. I know this because the image is made of rectangles, which are sharp in the PDF but blurred in the extracted image.

Most questions asked online about resolution are about the resolution of the image of a whole page, obtained with page.get_pixmap, and are solved with the dpi or transform arguments. This does not seem to apply to my issue.

Any help would be appreciated.

aleblanc30 · 2024-07-01T13:48:08Z

aleblanc30
Jul 1, 2024
Author

Hi again,

I may not have provided enough information in my initial post, so let me update it with my script and the PDF I am working with.

If someone wants to engage with this question but lacks information, please ask me to expand; I'll gladly do so, as I need the help :-)

to_extract.pdf

import pymupdf
import os
import matplotlib.pyplot as plt
from PIL import Image, ImageTk
from pprint import pprint
from math import sqrt
import numpy as np
from copy import deepcopy

# Locate the sample PDF next to this script and open its first page.
# The directory is kept in `script_dir` (not `dir`) so the builtin dir()
# stays usable in the rest of the module.
script_dir = os.path.dirname(__file__)
fn = os.path.join(script_dir, 'to_extract.pdf')

doc = pymupdf.open(fn)
page = doc.load_page(0)

# extract scale ticks
# A tick is a drawing made of exactly one line item ('l', p1, p2) whose two
# end points share the same y and which is shorter than .65 (units are those
# of page.get_drawings() - presumably PDF points, TODO confirm).
ticks = []
for drawing in page.get_drawings():
    items = drawing['items']
    if len(items) != 1:
        continue
    item = items[0]
    is_horizontal_line = (
        len(item) == 3 and item[0] == 'l' and item[1].y == item[2].y
    )
    if not is_horizontal_line:
        continue
    p1, p2 = item[1], item[2]
    norm = sqrt((p1.x - p2.x)**2 + (p1.y - p2.y)**2)
    if norm < .65:
        ticks.append(drawing)

# extract rasters, colorbars and scale labels
# Every image of the page is extracted once, annotated with its xref, smask,
# on-page bbox and transform, and then sorted into one of the lists below
# according to the 'width'/'height' reported by extract_image().
scale_bars = []
labels = []
rasters = []
others = []
page.clean_contents()

for entry in page.get_images(full=True):
    xref = entry[0]
    smask = entry[1]
    try:
        bbox, transform = page.get_image_bbox(entry, transform=True)
    except Exception:
        # Best effort: images whose bbox cannot be determined are skipped.
        # Only Exception is caught so Ctrl-C / SystemExit still propagate.
        continue
    img = doc.extract_image(xref)
    img['xref'] = xref
    img['smask'] = smask
    img['bbox'] = bbox
    img['transform'] = transform
    w, h = img['width'], img['height']

    if h > 120:
        # Tall images: narrow ones are colour bars, wide ones are rasters.
        if w < 100:
            scale_bars.append(img)
        elif w > 160:
            rasters.append(img)
        else:
            others.append(img)
    elif w <= 10 and h <= 20:
        # Small image: candidate tick label. Compare the middle of its right
        # edge with the first end point of every tick and keep the closest.
        x, y = bbox.x1, .5*(bbox.y1+bbox.y0)
        dmin, imin = 1e10, -1
        for i, t in enumerate(ticks):
            p1, p2 = t['items'][0][1:]
            d = sqrt((p1.x-x)**2 + (p1.y-y)**2)
            if d < dmin:
                dmin, imin = d, i
        # Accept the match only when the label sits within 3 units of the tick.
        if dmin < 3:
            ticks[imin]['label_img'] = img
            labels.append(img)
    else:
        others.append(img)

# run ocr on ticks
# For every tick that received a label image, OCR that image and store the
# recognised integer (clamped to the range 0..3) under tick['label'].
for tick in ticks:
    if 'label_img' not in tick:
        continue
    img = tick['label_img']
    pix = pymupdf.Pixmap(doc.extract_image(img['xref'])['image'])

    mode = "RGBA" if pix.alpha else "RGB"
    img_ = np.array(Image.frombytes(mode, [pix.width, pix.height], pix.samples), dtype=np.uint8)

    samples = bytearray(img_.tobytes())    # get plain pixel data from numpy array
    # NOTE(review): the new pixmap is declared RGB without alpha; if the
    # source had alpha, `samples` holds 4 bytes per pixel - confirm that the
    # label images never carry an alpha channel.
    pix = pymupdf.Pixmap(pymupdf.csRGB, img_.shape[1], img_.shape[0], samples, False)

    temp_doc = pymupdf.open("pdf", pix.pdfocr_tobytes())
    text = temp_doc[0].get_text()
    if text != '':
        try:
            # OCR tends to read the digit zero as a capital 'O'.
            value = int(text.strip(' \n').replace('O', '0'))
        except ValueError:
            # Unreadable OCR output: leave this tick without a label instead
            # of aborting the whole script.
            continue
        tick['label'] = max(0, min(value, 3))


# attribute ticks to scale_bars
# Each scale bar receives two parallel lists: 'ticks' (relative position of
# the tick along the bar, measured from the bar's larger-y edge) and
# 'ticklabels' (the OCR label of that tick, or None).
for scale_bar in scale_bars:
    bar_box = scale_bar['bbox']
    ymin = min(bar_box.y0, bar_box.y1)
    ymax = max(bar_box.y0, bar_box.y1)
    scale_x = bar_box.x1
    scale_bar['ticks'] = []
    scale_bar['ticklabels'] = []
    for tick in ticks:
        line = tick['items'][0]
        p1, p2 = line[1], line[2]
        # Squared horizontal gap between the bar's x1 edge and the tick's
        # second end point.
        d2 = (scale_x - p2.x)**2
        if p2.y < ymin or p2.y > ymax:
            # NOTE(review): this adds the distance to the FARTHER horizontal
            # edge (max), which pushes any tick outside the bar's y-range at
            # least one bar-height away. If the distance to the nearer edge
            # was intended, this should be min - confirm.
            d2 += max((p2.y - ymin)**2, (p2.y - ymax)**2)
        d = sqrt(d2)
        if d < 10:
            scale_bar['ticks'].append((ymax - p1.y)/(ymax - ymin))
            scale_bar['ticklabels'].append(tick.get('label', None))
        
class Graph:
    """A raster image together with the anchor point used to match it.

    Attributes:
        raster: the dict passed in; it must provide a 'bbox' entry exposing
            x1, y0 and y1.
        x: the bbox's x1 coordinate.
        y: the midpoint between the bbox's y0 and y1.
    """

    def __init__(self, raster):
        box = raster['bbox']
        self.raster = raster
        self.x = box.x1
        self.y = .5 * (box.y0 + box.y1)

# Arrange the rasters as a grid: order all graphs by y, cut them into five
# rows of four plus one final row with whatever is left, then order every
# row by x.
graphs = sorted((Graph(r) for r in rasters), key=lambda g: g.y)
graphs = [graphs[k:k + 4] for k in range(0, 20, 4)] + [graphs[20:]]
graphs = [sorted(row, key=lambda g: g.x) for row in graphs]

# attribute scales to each graph
# Every scale bar is handed to the graph whose anchor (g.x, g.y) lies closest
# to the middle of the bar's x0 edge.
graphs_with_scales = []  # not filled anywhere in the visible code
for scale_bar in scale_bars:
    bar_box = scale_bar['bbox']
    x = bar_box.x0
    y = .5*(bar_box.y0 + bar_box.y1)
    # Sentinel indices: they stay out of range if no graph is closer than 1e5.
    imin, jmin, dmin = 1000, 1000, 1e5
    for i, row in enumerate(graphs):
        for j, g in enumerate(row):
            d = sqrt((x - g.x)**2 + (y - g.y)**2)
            if d < dmin:
                imin, jmin, dmin = i, j, d
    graphs[imin][jmin].scale_bar = scale_bar

# handle misplaced scale bars : attribute bars and update bboxes
# Maps the (row, column) of a graph that needs a scale bar to the
# (row, column) of the graph whose scale bar is copied for it.
missing_correct = {
    (2, 0): (2, 1),
    (0, 2): (1, 3),
    (1, 2): (1, 3),
    (0, 3): (1, 3),
    (4, 0): (3, 0),
    (4, 1): (3, 1),
    (5, 1): (3, 1),
}
for (i, j), (src_i, src_j) in missing_correct.items():
    g = graphs[i][j]
    # Deep copy so that moving the bbox below leaves the donor bar untouched.
    g.scale_bar = deepcopy(graphs[src_i][src_j].scale_bar)
    bbox = g.scale_bar['bbox']
    w, h = bbox.x1 - bbox.x0, bbox.y1 - bbox.y0
    # Re-anchor the copied bar one unit past the raster's x1 edge, aligned
    # with the raster's y0, keeping the bar's own width and height.
    raster_box = g.raster['bbox']
    x0, y0 = raster_box.x1 + 1, raster_box.y0
    bbox.x0, bbox.y0 = x0, y0
    bbox.x1, bbox.y1 = x0 + w, y0 + h


# check consistency of OCR ticks and attribute labels to other ticks
# For each graph's scale bar: sort the (position, label) pairs by position,
# then fill unlabelled ticks from their neighbours (previous label + 1,
# next label - 1). If some label is still missing afterwards, fall back to
# the fixed label list [-1, 0, 1, 2, 3].
for i, r in enumerate(graphs):
    for j, g in enumerate(r):
        sb = g.scale_bar
        pairs = sorted(
            ([pos, lbl] for pos, lbl in zip(sb['ticks'], sb['ticklabels'])),
            key=lambda pair: pair[0],
        )
        # Five sweeps in each direction.
        for _ in range(5):
            for k in range(1, len(pairs)):
                before = pairs[k - 1][1]
                if pairs[k][1] is None and before is not None:
                    pairs[k][1] = before + 1
            for k in range(len(pairs) - 1):
                after = pairs[k + 1][1]
                if pairs[k][1] is None and after is not None:
                    pairs[k][1] = after - 1
        sb['ticks'] = [pair[0] for pair in pairs]
        sb['ticklabels'] = [pair[1] for pair in pairs]
        if None in sb['ticklabels']:
            sb['ticklabels'] = [-1, 0, 1, 2, 3]

# create plot to ascertain that everything is in order
# Debug view: draws every raster bbox as a coloured rectangle, overlays the
# scale bar image of each graph and marks the tick positions with their
# labels. Disabled unless `plot` is set to True.
plot = False
if plot:
    import matplotlib.patches as patches
    plt.figure()
    ax = plt.axes()
    # The limits are set twice; the second pair of calls is the one in effect.
    ax.set_xlim(200,600)
    ax.set_ylim(50,500)
    ax.set_xlim(-100,1000)
    ax.set_ylim(-100,1000)
    # Flip the y axis (presumably so the plot matches the page's top-down
    # y direction - TODO confirm).
    ax.invert_yaxis()
    c = ['r', 'g', 'b', 'c', 'y', 'm']
    rects = []  # never filled in the visible code
    from random import random
    for i, r in enumerate(graphs):
        for j, g in enumerate(r):
            # Raster: filled rectangle over its bbox, colour chosen by (i+j)%6.
            bbox=g.raster['bbox']
            x, y = bbox.x1, .5*(bbox.y0+bbox.y1)
            # ax.plot(x, y, 'ok')
            rect = patches.Rectangle((bbox.x0, bbox.y0), bbox.x1-bbox.x0, bbox.y1-bbox.y0, fc=c[(i+j)%6], fill=True, visible=True)
            ax.add_artist(rect)

            # Scale bar: combine the image with its soft mask and draw it
            # inside the scale bar's bbox.
            bbox=g.scale_bar['bbox']
            x, y = bbox.x0, .5*(bbox.y0+bbox.y1)
            # ax.plot(x, y, 'or')
            x0, y0 = (bbox.x0, bbox.y0)
            w, h = bbox.x1-bbox.x0, bbox.y1-bbox.y0
            pix = pymupdf.Pixmap(doc.extract_image(g.scale_bar['xref'])['image'])
            mask = pymupdf.Pixmap(doc.extract_image(g.scale_bar['smask'])['image'])
            pix = pymupdf.Pixmap(pix, mask)
            mode = "RGBA" if pix.alpha else "RGB"
            img_ = Image.frombytes(mode, [pix.width, pix.height], pix.samples)
            ax.imshow(img_, extent=[x0, x0+w, y0, y0+h], origin='lower')
            # rect = patches.Rectangle((x0,y0), w, h, fc=c[(i+j)%6], fill=True, visible=True)
            # With the line above commented out, `rect` is still the raster
            # rectangle, so the same artist is added a second time here.
            ax.add_artist(rect)
            xticks, yticks = [],[]
            # Rebinds `r` (the outer loop's row variable); the value is not
            # used afterwards in the visible code.
            r = random()
            # One text label and one marker per tick, placed at the bar's
            # horizontal middle and at the tick's relative height.
            for tick, lbl in zip(g.scale_bar['ticks'], g.scale_bar['ticklabels']):
                x, y = x0+.5*w, bbox.y1-h*tick
                ax.text(x, y, lbl)
                xticks.append(x)
                yticks.append(y)
            plt.plot(xticks, yticks, 'ok')
    plt.show()

# extract timeseries
# For every graph: decode the raster image, build one interpolator per colour
# channel over a normalised [0, 1] x [0, 1] grid, sample it on a fixed
# 19 x 45 lattice and show the original next to the resampled result.
import io
from scipy.interpolate import interp1d, RegularGridInterpolator
for i, r in enumerate(graphs):
    for j, g in enumerate(r):
        # Colour bar: column 5 of the image, rows reversed.
        # `scale` is not used further in the visible code.
        pix = pymupdf.Pixmap(doc.extract_image(g.scale_bar['xref'])['image'])
        mode = "RGB"
        scale = np.array(Image.frombytes(mode, [pix.width, pix.height], pix.samples), dtype=np.uint8)
        scale = scale[::-1, 5, :]
        print(g.raster)
        # This pixmap and `mode` are leftovers of the commented-out variant
        # below; the raster is decoded from the stored image bytes instead.
        pix = pymupdf.Pixmap(doc.extract_image(g.raster['xref'])['image'])
        mode = "RGB"
        # raster = np.array(Image.frombytes(mode, [pix.width, pix.height], pix.samples), dtype=np.uint8)
        raster = np.array(Image.open(io.BytesIO(g.raster['image'])), dtype=np.uint8)
        print(raster.shape)

        # One interpolator per channel, rows and columns both mapped to [0, 1].
        # NOTE(review): no `method` is passed, so SciPy's default applies;
        # if that blends neighbouring pixels it could soften sharp cell
        # edges - confirm against the SciPy documentation.
        row_axis = np.linspace(0, 1, raster.shape[0])
        col_axis = np.linspace(0, 1, raster.shape[1])
        red, green, blue = [
            RegularGridInterpolator((row_axis, col_axis), raster[:, :, channel])
            for channel in range(3)
        ]

        # Sampling lattice: 19 values running from .97 to .022 and 45 values
        # running from .985 to .01 (both reversed again before use).
        ni = 19
        si = np.linspace(0, 1, ni)
        si = .022*si+.97*(1-si)
        nj = 45
        sj = np.linspace(0, 1, nj)
        sj = .01*sj+.985*(1-sj)

        _, axs = plt.subplots(1, 2)
        axs[0].imshow(raster, extent=[0, 1, 0, 1])
        raster = np.empty((ni, nj, 3), dtype=np.uint8)
        grid_i, grid_j = np.meshgrid(si[::-1], sj[::-1], indexing='ij')
        # Mark the sample points on top of the original raster.
        axs[0].plot(grid_j.flatten(), grid_i.flatten(), 'or')
        print(grid_i.shape, grid_j.shape, raster.shape)
        # The interpolated values are stored into a uint8 array.
        raster[:, :, 0] = red((grid_i, grid_j))
        raster[:, :, 1] = green((grid_i, grid_j))
        raster[:, :, 2] = blue((grid_i, grid_j))
        axs[1].imshow(raster, extent=[0, 1, 0, 1])
        plt.show()

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Extracted image has visibly worse resolution #3597

{{title}}

Replies: 1 comment

{{title}}

Select a reply

Extracted image has visibly worse resolution #3597

aleblanc30 Jun 19, 2024

Replies: 1 comment

aleblanc30 Jul 1, 2024 Author

aleblanc30
Jun 19, 2024

aleblanc30
Jul 1, 2024
Author