Skip to content

Commit

Permalink
Merge branch 'iss_562_refactor_book_creation' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
jzohrab committed Jan 9, 2025
2 parents 3a949d7 + 1e5cb8d commit 4e168d2
Show file tree
Hide file tree
Showing 17 changed files with 436 additions and 371 deletions.
52 changes: 22 additions & 30 deletions lute/book/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,14 @@
from wtforms.validators import DataRequired, Length, NumberRange
from flask_wtf import FlaskForm
from flask_wtf.file import FileField, FileAllowed
from lute.book.service import Service


def _tag_values(field_data):
"Convert field data to array."
ret = []
if field_data:
ret = [h["value"] for h in json.loads(field_data)]
return ret


class NewBookForm(FlaskForm):
Expand Down Expand Up @@ -68,22 +75,15 @@ def _data(arr):
def populate_obj(self, obj):
"Call the populate_obj method from the parent class, then mine."
super().populate_obj(obj)

def _values(field_data):
"Convert field data to array."
ret = []
if field_data:
ret = [h["value"] for h in json.loads(field_data)]
return ret

obj.book_tags = _values(self.book_tags.data)

service = Service()
if self.textfile.data:
obj.text = service.get_file_content(self.textfile.data)
f = self.audiofile.data
if f:
obj.audio_filename = service.save_audio_file(f)
obj.book_tags = _tag_values(self.book_tags.data)
tfd = self.textfile.data
if tfd:
obj.text_stream = tfd.stream
obj.text_stream_filename = tfd.filename
afd = self.audiofile.data
if afd:
obj.audio_stream = afd.stream
obj.audio_stream_filename = afd.filename

def validate_language_id(self, field): # pylint: disable=unused-argument
"Language must be set."
Expand Down Expand Up @@ -139,19 +139,11 @@ def _data(arr):
def populate_obj(self, obj):
"Call the populate_obj method from the parent class, then mine."
super().populate_obj(obj)
obj.book_tags = _tag_values(self.book_tags.data)

def _values(field_data):
"Convert field data to array."
ret = []
if field_data:
ret = [h["value"] for h in json.loads(field_data)]
return ret

obj.book_tags = _values(self.book_tags.data)

f = self.audiofile.data
service = Service()
if f:
obj.audio_filename = service.save_audio_file(f)
afd = self.audiofile.data
if afd:
obj.audio_stream = afd.stream
obj.audio_stream_filename = afd.filename
obj.audio_bookmarks = None
obj.audio_current_pos = None
108 changes: 106 additions & 2 deletions lute/book/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,67 @@
Book domain objects.
"""

from lute.models.book import Book as DBBook, BookTag
from lute.models.book import BookTag, Book as DBBook, Text as DBText
from lute.models.repositories import (
BookRepository,
BookTagRepository,
LanguageRepository,
)


class SentenceGroupIterator:
    """
    An iterator of ParsedTokens that groups them by sentence, up
    to a maximum number of tokens.

    Each token must expose ``is_word`` and ``is_end_of_sentence``
    attributes (compared against 1).  Groups can be pulled one at a
    time with ``next()``, which returns False when exhausted, or
    consumed with a regular ``for`` loop.
    """

    def __init__(self, tokens, maxcount=500):
        """
        tokens: sequence of parsed tokens to be grouped.
        maxcount: maximum number of word tokens wanted per group.
        """
        self.tokens = tokens
        self.maxcount = maxcount
        self.currpos = 0

    def __iter__(self):
        """
        Yield the remaining sentence groups, advancing the iterator.
        """
        while group := self.next():
            yield group

    def count(self):
        """
        Get count of groups that will be returned.

        The current position is left unchanged.
        """
        old_currpos = self.currpos
        try:
            c = 0
            while self.next():
                c += 1
        finally:
            # Always rewind, so that a failure part-way through the
            # scan does not leave the iterator consumed.
            self.currpos = old_currpos
        return c

    def next(self):
        """
        Get next sentence group.

        Returns the next slice of tokens, or False if all tokens have
        already been returned.
        """
        total = len(self.tokens)
        if self.currpos >= total:
            return False

        start = self.currpos
        curr_tok_count = 0
        last_eos = -1
        i = start

        # Keep scanning until the word budget is exceeded AND at least
        # one end-of-sentence has been seen, or the tokens run out.
        while (curr_tok_count <= self.maxcount or last_eos == -1) and i < total:
            tok = self.tokens[i]
            if tok.is_end_of_sentence == 1:
                last_eos = i
            if tok.is_word == 1:
                curr_tok_count += 1
            i += 1

        if curr_tok_count <= self.maxcount or last_eos == -1:
            # The scan can only stop here by reaching the end of the
            # tokens (i == total), so everything remaining is returned.
            end = i
        else:
            # Over budget: cut just after the last complete sentence.
            end = last_eos + 1

        self.currpos = end
        return self.tokens[start:end]


class Book: # pylint: disable=too-many-instance-attributes
"""
A book domain object, to create/edit lute.models.book.Books.
Expand All @@ -32,6 +85,19 @@ def __init__(self):
self.audio_bookmarks = None
self.book_tags = []

# The source file used for the book text.
# Overrides the self.text if not None.
self.text_source_path = None

self.text_stream = None
self.text_stream_filename = None

# The source file used for audio.
self.audio_source_path = None

self.audio_stream = None
self.audio_stream_filename = None

def __repr__(self):
return f"<Book (id={self.id}, title='{self.title}')>"

Expand Down Expand Up @@ -92,6 +158,40 @@ def commit(self):
"""
self.session.commit()

def _split_text_at_page_breaks(self, txt):
"Break fulltext manually at lines consisting of '---' only."
# Tried doing this with a regex without success.
segments = []
current_segment = ""
for line in txt.split("\n"):
if line.strip() == "---":
segments.append(current_segment.strip())
current_segment = ""
else:
current_segment += line + "\n"
if current_segment:
segments.append(current_segment.strip())
return segments

def _split_by_sentences(self, language, fulltext, max_word_tokens_per_text=250):
    """Split fulltext into a list of page strings, respecting sentences.

    The text is first cut at manual page breaks; each resulting segment
    is parsed with the language's parser and its tokens are grouped by
    SentenceGroupIterator, using max_word_tokens_per_text as the group
    size limit.  Pages that are empty after clean-up are dropped.
    """
    pages = []
    for segment in self._split_text_at_page_breaks(fulltext):
        parsed = language.parser.get_parsed_tokens(segment, language)
        groups = SentenceGroupIterator(parsed, max_word_tokens_per_text)
        group = groups.next()
        while group:
            joined = "".join(t.token for t in group)
            # Drop carriage returns and turn the '¶' token text back
            # into a newline before trimming.
            page = joined.replace("\r", "").replace("¶", "\n").strip()
            if page != "":
                pages.append(page)
            group = groups.next()
    return pages

def _build_db_book(self, book):
"Convert a book business object to a DBBook."

Expand All @@ -107,9 +207,13 @@ def _build_db_book(self, book):

b = None
if book.id is None:
b = DBBook.create_book(book.title, lang, book.text, book.max_page_tokens)
pages = self._split_by_sentences(lang, book.text, book.max_page_tokens)
b = DBBook(book.title, lang)
for index, page in enumerate(pages):
_ = DBText(b, page, index + 1)
else:
b = self.book_repo.find(book.id)

b.title = book.title
b.source_uri = book.source_uri
b.audio_filename = book.audio_filename
Expand Down
29 changes: 18 additions & 11 deletions lute/book/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,16 @@
flash,
)
from lute.utils.data_tables import DataTablesFlaskParamParser
from lute.book.service import Service, BookImportException
from lute.book.service import (
Service as BookService,
BookImportException,
BookDataFromUrl,
)
from lute.book.datatables import get_data_tables_list
from lute.book.forms import NewBookForm, EditBookForm
from lute.book.stats import Service as StatsService
import lute.utils.formutils
from lute.db import db

from lute.models.language import Language
from lute.models.repositories import (
BookRepository,
Expand Down Expand Up @@ -83,14 +86,18 @@ def datatables_archived_source():


def _book_from_url(url):
"Create a new book, or flash an error if can't parse."
b = Book()
service = Service()
"Get data for a new book, or flash an error if can't parse."
service = BookService()
bd = None
try:
b = service.book_from_url(url)
bd = service.book_data_from_url(url)
except BookImportException as e:
flash(e.message, "notice")
b = Book()
bd = BookDataFromUrl()
b = Book()
b.title = bd.title
b.source_uri = bd.source_uri
b.text = bd.text
return b


Expand Down Expand Up @@ -119,8 +126,8 @@ def new():
if form.validate_on_submit():
try:
form.populate_obj(b)
book = repo.add(b)
repo.commit()
svc = BookService()
book = svc.import_book(b, db.session)
return redirect(f"/read/{book.id}/page/1", 302)
except BookImportException as e:
flash(e.message, "notice")
Expand Down Expand Up @@ -149,8 +156,8 @@ def edit(bookid):

if form.validate_on_submit():
form.populate_obj(b)
repo.add(b)
repo.commit()
svc = BookService()
svc.import_book(b, db.session)
flash(f"{b.title} updated.")
return redirect("/", 302)

Expand Down
Loading

0 comments on commit 4e168d2

Please sign in to comment.