Skip to content

Commit

Permalink
Merge pull request #225 from edelgm6/textract
Browse files Browse the repository at this point in the history
Textract integration
  • Loading branch information
edelgm6 authored Jul 24, 2024
2 parents 4b00905 + db0627d commit 039125a
Show file tree
Hide file tree
Showing 36 changed files with 1,052 additions and 74 deletions.
13 changes: 11 additions & 2 deletions api/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from .models import (
PrefillItem, Prefill, Amortization, TaxCharge, Account, Transaction,
JournalEntry, JournalEntryItem, AutoTag, CSVProfile, Reconciliation,
CSVColumnValuePair
CSVColumnValuePair, S3File, DocSearch, Paystub, PaystubValue
)


Expand Down Expand Up @@ -56,8 +56,13 @@ class PrefillItemInline(admin.TabularInline):
extra = 8


class DocSearchInline(admin.TabularInline):
    """Tabular inline for editing ``DocSearch`` rows on a parent admin page."""

    # Number of extra blank forms offered in the inline formset.
    extra = 8
    model = DocSearch


class PrefillAdmin(admin.ModelAdmin):
inlines = [PrefillItemInline]
inlines = [PrefillItemInline, DocSearchInline]
list_display = ('description',)

def description(self, obj):
Expand All @@ -77,3 +82,7 @@ def description(self, obj):
admin.site.register(CSVColumnValuePair)
admin.site.register(Amortization)
admin.site.register(PrefillItem)
# Register the document/paystub models with the default ModelAdmin in one call;
# AdminSite.register accepts an iterable of models and registers them in order.
admin.site.register([S3File, DocSearch, Paystub, PaystubValue])
127 changes: 127 additions & 0 deletions api/aws_services.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import re
import boto3
import uuid
from django.conf import settings
from decimal import Decimal

def get_boto3_client(service='textract'):
    """Build a boto3 client for ``service`` from the project's AWS settings.

    Args:
        service: boto3 service name; defaults to ``'textract'``.

    Returns:
        The client returned by ``boto3.client`` configured with the access
        key, secret key and region read from Django settings.
    """
    credentials = {
        'aws_access_key_id': settings.AWS_ACCESS_KEY_ID,
        'aws_secret_access_key': settings.AWS_SECRET_ACCESS_KEY,
        'region_name': settings.AWS_REGION_NAME,
    }
    return boto3.client(service, **credentials)

def upload_file_to_s3(file):
    """Upload ``file`` to the configured S3 bucket under a generated name.

    Args:
        file: An uploaded file object exposing ``name`` and ``content_type``
            and usable with ``upload_fileobj``.

    Returns:
        The generated object name (``str``) on success. On any exception
        raised during the upload, the error is printed and a dict of the form
        ``{'error': <text>, 'message': 'Upload failed'}`` is returned instead,
        so callers must check the type of the result.
    """
    client = get_boto3_client(service='s3')
    object_name = generate_unique_filename(file)
    try:
        client.upload_fileobj(
            file,
            settings.AWS_STORAGE_BUCKET_NAME,
            object_name,
            ExtraArgs={'ContentType': file.content_type},
        )
    except Exception as exc:
        failure = str(exc)
        print(failure)
        return {'error': failure, 'message': 'Upload failed'}
    else:
        return object_name

def generate_unique_filename(file):
    """Return a random UUID-based name that keeps the upload's extension.

    The extension is whatever follows the last ``.`` in ``file.name``. When
    the name has no ``.`` (or ends with one) there is no extension to keep,
    so the bare UUID is returned rather than appending the whole original
    filename (or an empty suffix) after a dot.

    Args:
        file: Object exposing a ``name`` string attribute.

    Returns:
        ``"<uuid4>.<ext>"`` when an extension is present, else ``"<uuid4>"``.
    """
    _, dot, ext = file.name.rpartition('.')
    token = uuid.uuid4()
    if dot and ext:
        return f"{token}.{ext}"
    return f"{token}"

def create_textract_job(filename):
    """Start a Textract document analysis for an object in the storage bucket.

    Args:
        filename: Name (key) of the object inside
            ``settings.AWS_STORAGE_BUCKET_NAME``.

    Returns:
        The ``JobId`` value from the ``start_document_analysis`` response, or
        ``None`` if the response has no such key.
    """
    textract = get_boto3_client()
    s3_object = {
        'Bucket': settings.AWS_STORAGE_BUCKET_NAME,
        'Name': filename,
    }
    # Request both form (key/value) and table extraction.
    analysis = textract.start_document_analysis(
        DocumentLocation={'S3Object': s3_object},
        FeatureTypes=['FORMS', 'TABLES'],
    )
    return analysis.get('JobId')

def get_textract_results(job_id):
    """Fetch every page of a Textract analysis and merge them.

    Calls ``get_document_analysis`` repeatedly, following ``NextToken`` until
    a response carries none, then hands the collected responses to
    ``combine_responses``.

    Args:
        job_id: Identifier of the Textract job to read.

    Returns:
        The merged response produced by ``combine_responses``.
    """
    textract = get_boto3_client()

    pages = []
    request = {'JobId': job_id}
    while True:
        page = textract.get_document_analysis(**request)
        pages.append(page)

        token = page.get('NextToken')
        if not token:
            break
        # Ask for the following page on the next iteration.
        request['NextToken'] = token

    return combine_responses(pages)

def combine_responses(responses):
    """Merge paginated responses into one ``DocumentMetadata``/``Blocks`` dict.

    Args:
        responses: Iterable of response dicts, each containing a ``"Blocks"``
            list and optionally ``"DocumentMetadata"`` with a ``"Pages"`` entry.

    Returns:
        A dict whose ``"Blocks"`` is the concatenation of every response's
        blocks in order, and whose ``"DocumentMetadata"["Pages"]`` is the
        value from the last response that provided one (``""`` if none did).

    Raises:
        KeyError: If a response has no ``"Blocks"`` key.
    """
    page_count = ""
    blocks = []

    for chunk in responses:
        try:
            page_count = chunk["DocumentMetadata"]["Pages"]
        except KeyError:
            # This chunk carries no page count; keep the previous value.
            pass
        blocks.extend(chunk["Blocks"])

    return {
        "DocumentMetadata": {"Pages": page_count},
        "Blocks": blocks,
    }

def convert_table_to_cleaned_dataframe(table):
    """Turn an extracted table into a DataFrame keyed by its first row/column.

    Args:
        table: Object providing ``strip_headers(...)`` whose result provides
            ``to_pandas()``.

    Returns:
        A DataFrame whose column labels come from the table's first row, whose
        index comes from its first column, and whose labels on both axes have
        surrounding whitespace removed.
    """
    stripped = table.strip_headers(
        column_headers=False,
        in_table_title=True,
        section_titles=True,
    )
    raw = stripped.to_pandas()

    # Promote the first row to column labels, then drop it from the data rows.
    raw.columns = raw.iloc[0]
    frame = raw[1:]

    # The first column holds the row labels.
    frame = frame.set_index(frame.columns[0])

    # Labels are strings; trim stray whitespace on both axes.
    frame.columns = frame.columns.str.strip()
    frame.index = frame.index.str.strip()

    return frame

def clean_string(input_string):
    """Remove commas and normalise whitespace in ``input_string``.

    Args:
        input_string: Text to clean, or ``None``.

    Returns:
        ``None`` when given ``None``; otherwise the text with every comma
        removed, leading/trailing whitespace dropped and each internal run of
        whitespace collapsed to a single space.
    """
    if input_string is None:
        return None

    # split() with no argument discards all whitespace runs, so re-joining on
    # a single space both trims the ends and collapses the middle.
    words = input_string.replace(',', '').split()
    return ' '.join(words)

def clean_and_convert_string_to_decimal(input_string):
    """Parse a formatted amount such as ``"$1,234.5"`` into a 2-place Decimal.

    Every character other than a digit or ``.`` is discarded before parsing,
    which already covers commas, currency symbols and whitespace.

    NOTE(review): minus signs and parentheses are discarded too, so negative
    amounts come back positive. This matches the previous behaviour; confirm
    it is intended for the documents being parsed.

    Args:
        input_string: Amount text, or a falsy value (``None``/``""``).

    Returns:
        The amount quantized to two decimal places. Falsy input, and input
        containing no digits at all (e.g. ``"$"`` or ``"N/A"``), yields
        ``Decimal('0.00')``.

    Raises:
        decimal.InvalidOperation: If the remaining characters are not a valid
            number (for example more than one decimal point between digits).
    """
    zero = Decimal('0.00')
    if not input_string:
        return zero

    numeric_text = re.sub(r'[^\d.]', '', input_string)
    # Nothing but (possibly) dots left: treat like empty input instead of
    # letting Decimal('') / Decimal('.') raise.
    if not numeric_text.strip('.'):
        return zero

    return Decimal(numeric_text).quantize(zero)
3 changes: 2 additions & 1 deletion api/factories.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from api.models import Reconciliation, Account, TaxCharge
from api.models import Reconciliation, Account, TaxCharge, Paystub, PaystubValue


class ReconciliationFactory:
Expand Down Expand Up @@ -46,3 +46,4 @@ def create_bulk_tax_charges(date):
]
if len(existing_tax_charge) == 0:
TaxCharge.objects.create(date=date, type=value, amount=0)

24 changes: 22 additions & 2 deletions api/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,34 @@
from django.utils import timezone
from django.core.validators import MinValueValidator
from django.core.exceptions import ValidationError
from django.conf import settings
from api.models import (
Amortization, Transaction, Account, JournalEntryItem,
TaxCharge, Reconciliation, JournalEntry
TaxCharge, Reconciliation, JournalEntry, S3File, Prefill
)

from api import utils
from api.factories import ReconciliationFactory
from api.aws_services import upload_file_to_s3


class DocumentForm(forms.Form):
    """Upload form pairing a document with the Prefill used to process it."""

    document = forms.FileField()
    # Only offer prefills that have at least one related DocSearch row.
    prefill = forms.ModelChoiceField(
        queryset=Prefill.objects.filter(docsearch__isnull=False).distinct(),
        required=True
    )

    def create_s3_file(self):
        """Upload the validated document to S3 and record it as an S3File.

        Must be called after the form has been validated, since it reads
        ``cleaned_data``.

        Returns:
            The newly created ``S3File`` row.

        Raises:
            OSError: If the upload did not succeed. ``upload_file_to_s3``
                signals failure by returning an error dict instead of the
                object name; without this check the dict would be formatted
                into the URL and stored as the S3 filename.
        """
        file = self.cleaned_data['document']
        unique_name = upload_file_to_s3(file=file)
        if not isinstance(unique_name, str):
            detail = unique_name.get('error') if isinstance(unique_name, dict) else unique_name
            raise OSError(f"Upload of {file.name!r} to S3 failed: {detail}")

        file_url = f"https://{settings.AWS_STORAGE_BUCKET_NAME}.s3.amazonaws.com/{unique_name}"
        s3file = S3File.objects.create(
            prefill=self.cleaned_data['prefill'],
            url=file_url,
            user_filename=file.name,
            s3_filename=unique_name
        )
        return s3file

class CommaDecimalField(DecimalField):
def to_python(self, value):
Expand Down
22 changes: 22 additions & 0 deletions api/migrations/0069_s3file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generated by Django 4.1.6 on 2024-07-12 15:59

from django.db import migrations, models


class Migration(migrations.Migration):
    """Create the ``S3File`` model (url, user_filename, s3_filename)."""

    dependencies = [
        ('api', '0068_journalentryitem_jei_date_idx_and_more'),
    ]

    operations = [
        migrations.CreateModel(
            name='S3File',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # The URL must be unique across all S3File rows.
                ('url', models.URLField(unique=True)),
                ('user_filename', models.CharField(max_length=200)),
                ('s3_filename', models.CharField(max_length=200)),
            ],
        ),
    ]
18 changes: 18 additions & 0 deletions api/migrations/0070_s3file_textract_job_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.1.6 on 2024-07-12 18:46

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the optional ``textract_job_id`` column to ``S3File``."""

    dependencies = [
        ('api', '0069_s3file'),
    ]

    operations = [
        migrations.AddField(
            model_name='s3file',
            name='textract_job_id',
            # Nullable and blank-able, so existing rows need no value.
            field=models.CharField(blank=True, max_length=200, null=True),
        ),
    ]
22 changes: 22 additions & 0 deletions api/migrations/0071_docsearch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generated by Django 4.1.6 on 2024-07-15 15:58

from django.db import migrations, models


class Migration(migrations.Migration):
    """Create the ``DocSearch`` model with optional keyword/row/column fields."""

    dependencies = [
        ('api', '0070_s3file_textract_job_id'),
    ]

    operations = [
        migrations.CreateModel(
            name='DocSearch',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # All three search fields are optional (blank and null allowed).
                ('keyword', models.CharField(blank=True, max_length=200, null=True)),
                ('row', models.CharField(blank=True, max_length=200, null=True)),
                ('column', models.CharField(blank=True, max_length=200, null=True)),
            ],
        ),
    ]
18 changes: 18 additions & 0 deletions api/migrations/0072_docsearch_table_name.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.1.6 on 2024-07-15 16:15

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the optional ``table_name`` column to ``DocSearch``."""

    dependencies = [
        ('api', '0071_docsearch'),
    ]

    operations = [
        migrations.AddField(
            model_name='docsearch',
            name='table_name',
            # Nullable and blank-able, so existing rows need no value.
            field=models.CharField(blank=True, max_length=200, null=True),
        ),
    ]
24 changes: 24 additions & 0 deletions api/migrations/0073_docsearch_account_docsearch_selection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Generated by Django 4.1.6 on 2024-07-15 19:22

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Add optional ``account`` and ``selection`` fields to ``DocSearch``."""

    dependencies = [
        ('api', '0072_docsearch_table_name'),
    ]

    operations = [
        migrations.AddField(
            model_name='docsearch',
            name='account',
            # Optional link to an Account; set to NULL if that account is deleted.
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='api.account'),
        ),
        migrations.AddField(
            model_name='docsearch',
            name='selection',
            # Optional choice among 'Company', 'Begin Period' and 'End Period'.
            field=models.CharField(blank=True, choices=[('Company', 'Company'), ('Begin Period', 'Begin Period'), ('End Period', 'End Period')], max_length=20, null=True),
        ),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Generated by Django 4.1.6 on 2024-07-16 14:43

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Create ``Paystub``/``PaystubValue`` and link DocSearch and S3File to Prefill."""

    dependencies = [
        ('api', '0073_docsearch_account_docsearch_selection'),
    ]

    operations = [
        migrations.CreateModel(
            name='Paystub',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('page_id', models.CharField(max_length=200)),
                ('title', models.CharField(max_length=200)),
            ],
        ),
        # One-off default of 1 backfills existing rows and is not kept on the
        # field (preserve_default=False).
        # NOTE(review): assumes a Prefill with pk=1 exists, and a one-to-one
        # link allows at most one existing DocSearch row to take it — confirm.
        migrations.AddField(
            model_name='docsearch',
            name='prefill',
            field=models.OneToOneField(default=1, on_delete=django.db.models.deletion.PROTECT, to='api.prefill'),
            preserve_default=False,
        ),
        # Same one-off backfill for S3File.
        # NOTE(review): same pk=1 / single-existing-row assumption — confirm.
        migrations.AddField(
            model_name='s3file',
            name='prefill',
            field=models.OneToOneField(default=1, on_delete=django.db.models.deletion.PROTECT, to='api.prefill'),
            preserve_default=False,
        ),
        migrations.CreateModel(
            name='PaystubValue',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('amount', models.DecimalField(decimal_places=2, max_digits=12)),
                # Accounts referenced by a value cannot be deleted (PROTECT).
                ('account', models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, to='api.account')),
                # Values are deleted together with their paystub (CASCADE).
                ('paystub', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='api.paystub')),
            ],
        ),
        migrations.AddField(
            model_name='paystub',
            name='document',
            # Paystubs are deleted together with their S3File (CASCADE).
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='api.s3file'),
        ),
        # NOTE(review): the field is named journal_entry but targets
        # 'api.prefill' — confirm whether 'api.journalentry' was intended.
        migrations.AddField(
            model_name='paystub',
            name='journal_entry',
            field=models.OneToOneField(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='api.prefill'),
        ),
    ]
19 changes: 19 additions & 0 deletions api/migrations/0075_alter_docsearch_prefill.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Generated by Django 4.1.6 on 2024-07-16 14:47

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Change ``DocSearch.prefill`` to a plain ForeignKey.

    The preceding migration (0074) added this field as a OneToOneField.
    """

    dependencies = [
        ('api', '0074_paystub_docsearch_prefill_s3file_prefill_and_more'),
    ]

    operations = [
        migrations.AlterField(
            model_name='docsearch',
            name='prefill',
            # A Prefill that still has DocSearch rows cannot be deleted (PROTECT).
            field=models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, to='api.prefill'),
        ),
    ]
Loading

0 comments on commit 039125a

Please sign in to comment.