Skip to content

Commit

Permalink
Save user uploads as WACZs
Browse files Browse the repository at this point in the history
  • Loading branch information
bensteinberg committed Dec 16, 2024
1 parent 189497f commit 844f189
Show file tree
Hide file tree
Showing 7 changed files with 573 additions and 44 deletions.
12 changes: 6 additions & 6 deletions perma_web/api/tests/test_link_authorization.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,13 +147,13 @@ def test_should_allow_user_to_patch_with_file(self):
# capture were properly associated with actual web archive files, which is always
# the case outside of tests
self.link.archive_timestamp = timezone.now() + timedelta(1)
self.link.warc_size = 1
self.link.warc_size = 0
self.link.wacz_size = 1
self.link.save()

# This link has a warc and a wacz
# This link has a wacz and no warc
self.link.refresh_from_db()
self.assertTrue(self.link.warc_size)
self.assertFalse(self.link.warc_size)
self.assertTrue(self.link.wacz_size)

old_primary_capture = self.link.primary_capture
Expand All @@ -168,10 +168,10 @@ def test_should_allow_user_to_patch_with_file(self):

self.assertTrue(Capture.objects.filter(link_id=self.link.pk, role='primary').exclude(pk=old_primary_capture.pk).exists())

# This link now only has a warc, but not a wacz
# This link still only has a wacz
self.link.refresh_from_db()
self.assertTrue(self.link.warc_size)
self.assertFalse(self.link.wacz_size)
self.assertFalse(self.link.warc_size)
self.assertTrue(self.link.wacz_size)


def test_should_reject_patch_with_file_for_out_of_window_link(self):
Expand Down
6 changes: 3 additions & 3 deletions perma_web/api/tests/test_link_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ def test_should_create_archive_from_pdf_file(self):
user=self.org_user)

link = Link.objects.get(guid=obj['guid'])
self.assertRecordsInArchive(link, upload=True, filetype='warc')
self.assertRecordsInArchive(link, upload=True, filetype='wacz')
self.assertEqual(link.primary_capture.user_upload, True)

def test_should_create_archive_from_jpg_file(self):
Expand All @@ -666,7 +666,7 @@ def test_should_create_archive_from_jpg_file(self):
user=self.org_user)

link = Link.objects.get(guid=obj['guid'])
self.assertRecordsInArchive(link, upload=True, filetype='warc')
self.assertRecordsInArchive(link, upload=True, filetype='wacz')
self.assertEqual(link.primary_capture.user_upload, True)

def test_should_reject_jpg_file_with_invalid_url(self):
Expand All @@ -687,7 +687,7 @@ def test_should_should_create_archive_from_jpg_file_with_nonloading_url(self):

link = Link.objects.get(guid=obj['guid'])
self.assertEqual(link.submitted_url, 'http://asdf.asdf')
self.assertRecordsInArchive(link, upload=True, filetype='warc')
self.assertRecordsInArchive(link, upload=True, filetype='wacz')
self.assertEqual(link.primary_capture.user_upload, True)

def test_should_reject_invalid_file(self):
Expand Down
26 changes: 17 additions & 9 deletions perma_web/perma/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,12 @@
first_day_of_next_month,
pp_date_from_post,
prep_for_perma_payments,
preserve_perma_warc,
preserve_perma_wacz,
process_perma_payments_transmission,
protocol,
remove_control_characters,
today_next_year,
tz_datetime,
write_resource_record_from_asset,
)

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -1976,7 +1975,7 @@ def get_pages_jsonl(self):

def write_uploaded_file(self, uploaded_file, cache_break=False):
"""
Given a file uploaded by a user, create a Capture record and warc.
Given a file uploaded by a user, create a Capture record and WACZ.
"""
from api.utils import get_mime_type, mime_type_lookup # local import to avoid circular import

Expand All @@ -1997,14 +1996,23 @@ def write_uploaded_file(self, uploaded_file, cache_break=False):
user_upload='True',
content_type=mime_type,
url=warc_url)
warc_size = [] # pass a mutable container to the context manager, so that it can populate it with the size of the finished warc
with preserve_perma_warc(self.guid, self.creation_timestamp, self.warc_storage_file(), warc_size) as warc:
uploaded_file.file.seek(0)
write_resource_record_from_asset(uploaded_file.file.read(), warc_url, mime_type, warc)

# make the WACZ
self.wacz_size = preserve_perma_wacz(
uploaded_file,
warc_url,
mime_type,
self.guid,
self.submitted_url,
self.submitted_title,
self.creation_timestamp,
self.wacz_storage_file()
)
self.warc_size = 0 # necessary?

self.captured_by_software = 'upload'
self.captured_by_browser = None
self.warc_size = warc_size[0]
self.save(update_fields=['captured_by_software', 'captured_by_browser', 'warc_size'])
self.save(update_fields=['captured_by_software', 'captured_by_browser', 'warc_size', 'wacz_size'])
capture.save()

def safe_delete_warc(self):
Expand Down
103 changes: 103 additions & 0 deletions perma_web/perma/templates/provenance-summary.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    {# Context variables used by this template: "url" (appears in the title and the summary sentence) and "now" (the timestamp string shown in the summary sentence). #}
    {# The charset declaration is kept ahead of the title because the title embeds an arbitrarily long URL, and the encoding declaration must appear within the first 1024 bytes of the document. #}
    <title>Provenance summary for user upload for {{url}}</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />

    <style>
      * {
        padding: 0px;
        margin: 0px;
        box-sizing: border-box;
      }

      html {
        font-size: 16px;
      }

      video {
        max-width: 100%;
      }

      main {
        width: 100%;
        padding: 1rem;
        margin: auto;
        max-width: 90ch;
      }

      section {
        margin-bottom: 1rem;
        padding-top: 1rem;
        border-top: 1px solid gray;
      }

      h1, h2 {
        margin-bottom: 0.5rem;
      }

      p {
        font-size: 1rem;
        line-height: 1.5rem;
        margin-bottom: 0.5rem;
      }

      p span {
        display: inline-block;
        background-color: antiquewhite;
        padding: 0.2rem;
        padding-left: 0.35rem;
        padding-right: 0.35rem;
        border-radius: 0.25rem;
      }

      ul {
        list-style-position: inside;
      }

      table {
        table-layout: fixed;
        border-collapse: collapse;
        width: 100%;
        text-align: left;
      }

      table * {
        word-break: break-word;
      }

      table tr {
        border-bottom: 1px solid lightgray;
      }

      table tr td, table tr th {
        padding: 0.75rem 0.25rem;
      }

      table tr td:first-of-type {
        min-width: 34ch;
      }

      table tr:last-of-type {
        border-bottom: 0px;
      }
    </style>

  </head>

  <body>

    <main>

      <header>
        <h1>Provenance Summary</h1>
        <p>The data present in this capture were uploaded by a Perma user to replace a failed or unsatisfactory capture of {{ url }} on {{ now }}.</p>
      </header>

    </main>

  </body>

</html>

112 changes: 96 additions & 16 deletions perma_web/perma/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import string
import tempfile
from typing import Literal, TypeVar
import uuid
import unicodedata
from wsgiref.util import FileWrapper

Expand All @@ -33,6 +34,7 @@
JsonResponse,
StreamingHttpResponse,
)
from django.template import loader
from django.urls import reverse
from django.utils import timezone
from django.views.decorators.debug import sensitive_variables
Expand All @@ -43,6 +45,7 @@
import surt
import tempdir
from ua_parser import user_agent_parser
from wacz.main import create_wacz
from warcio.warcwriter import BufferWARCWriter

from perma.exceptions import (
Expand Down Expand Up @@ -498,26 +501,103 @@ def decrypt_from_perma_payments(ciphertext, encoder=encoding.Base64Encoder):
return box.decrypt(ciphertext, encoder=encoder)

#
# warc writing
# wacz writing
#

def preserve_perma_wacz(uploaded_file, warc_url, mime_type, guid, url, title, timestamp, wacz_destination):
    """
    Creates and writes a perma WACZ for a user upload, returning the WACZ size.
    This necessarily creates a WARC, but we no longer save it.

    Parameters:
        uploaded_file: upload object whose ``.file`` attribute is a seekable,
            readable binary file; its full contents become a WARC resource record.
        warc_url: URL recorded for the uploaded asset's WARC record; also used
            as the first page entry and as the WACZ ``url`` option.
        mime_type: content type passed along with the uploaded asset's record.
        guid: link GUID; written into the WARC header and used to name the
            temporary ``.wacz`` file.
        url: the originally submitted URL; appears in the provenance summary,
            the page title, and the WACZ description.
        title: passed through as the WACZ ``title`` option.
        timestamp: datetime of the link; passed to the WARC header and
            formatted (UTC, millisecond precision, ``Z`` suffix) for the
            provenance summary and ``pages.jsonl``. Naive values are assumed
            to already be in UTC.
        wacz_destination: path handed to the WACZ storage backend.

    Returns:
        The size in bytes of the generated WACZ, measured on local disk.
    """
    # Local imports keep this function self-contained; the alias avoids
    # shadowing the module-level ``django.utils.timezone`` import.
    from datetime import timezone as dt_timezone
    from types import SimpleNamespace

    provenance_url = "file:///provenance-summary.html"

    # Produce e.g. "2024-12-16T12:34:56.123Z". For an aware UTC datetime with
    # non-zero microseconds this is byte-identical to the previous
    # ``isoformat()[:-9] + "Z"`` slicing, but it no longer produces a wrong
    # string when microseconds are zero, the datetime is naive, or the offset
    # is not UTC.
    utc_timestamp = timestamp
    if utc_timestamp.tzinfo is not None:
        utc_timestamp = utc_timestamp.astimezone(dt_timezone.utc).replace(tzinfo=None)
    ts_string = utc_timestamp.isoformat(timespec="milliseconds") + "Z"

    with tempfile.TemporaryDirectory() as tmpdir:
        warc_file = f"{tmpdir}/data.warc.gz"
        pages_jsonl = f"{tmpdir}/pages.jsonl"
        output = f"{tmpdir}/{guid}.wacz"

        # The context manager guarantees the WARC handle is closed even if a
        # record write or the template render raises.
        # NOTE(review): the 'ab+' mode is carried over unchanged; confirm
        # whether a plain 'wb' would suffice for a freshly created file.
        with open(warc_file, 'ab+') as warc:
            write_perma_warc_header(warc, guid, timestamp)

            uploaded_file.file.seek(0)
            write_resource_record_from_asset(uploaded_file.file.read(), warc_url, mime_type, warc)

            # create provenance summary and add it to the WARC
            provenance = loader.get_template("provenance-summary.html")
            context = {"url": url, "now": ts_string}
            write_resource_record_from_asset(
                provenance.render(context).encode(),
                provenance_url,
                "text/html",
                warc
            )

        # pages.jsonl: a header line followed by one entry per page
        pages = [
            {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"},
            {
                "id": f"{uuid.uuid4()}",
                "url": warc_url,
                "title": f"User-uploaded file replacing failed capture of {url}",
                "ts": ts_string
            },
            {
                "id": f"{uuid.uuid4()}",
                "url": provenance_url,
                "title": "Provenance Summary",
                "ts": ts_string
            }
        ]
        with open(pages_jsonl, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(page) + "\n" for page in pages)

        # py-wacz's create_wacz() reads its settings as attributes of the
        # object it is given, so a simple attribute namespace is enough.
        res = SimpleNamespace(
            inputs=[warc_file],
            output=output,
            pages=pages_jsonl,
            extra_pages=None,
            detect_pages=True,
            copy_pages=False,
            desc=f"User upload for {url}",
            hash_type=None,
            url=warc_url,
            ts=None,
            text=False,
            signing_url=None,
            signing_token=None,
            split_seeds=None,
            log_directory=None,
            title=title,
            date=None,
        )

        # create the WACZ, write it to storage...
        create_wacz(res)

        with open(output, "rb") as f:
            storages[settings.WACZ_STORAGE].store_file(f, wacz_destination, overwrite=True)

        # measure before leaving the block: the temporary directory (and the
        # WACZ inside it) is removed when the context manager exits
        wacz_size = os.path.getsize(output)

    # ...and return the size
    return wacz_size

#
# warc writing
#

def write_perma_warc_header(out_file, guid, timestamp):
# build warcinfo header
Expand Down
1 change: 1 addition & 0 deletions perma_web/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ tempdir # create temp dirs to be deleted at end
ua-parser # user agent parsing to detect mobile browsers during playbacks
warcio # helps us write metadata and inspect our WARCs
warctools # for creating warcs from uploads
wacz>=0.5.0 # for creating waczs from uploads

# alternate storages
django-storages # custom storage backends for Django
Expand Down
Loading

0 comments on commit 844f189

Please sign in to comment.