-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #225 from edelgm6/textract
Textract integration
- Loading branch information
Showing 36 changed files with 1,052 additions and 74 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
import re | ||
import boto3 | ||
import uuid | ||
from django.conf import settings | ||
from decimal import Decimal | ||
|
||
def get_boto3_client(service='textract'):
    """Return a boto3 client for ``service`` configured from Django settings.

    Args:
        service: Name of the AWS service passed to ``boto3.client``;
            defaults to ``'textract'``.

    Returns:
        The client object produced by ``boto3.client``.
    """
    # Credentials and region are read from settings on every call.
    connection_kwargs = {
        'aws_access_key_id': settings.AWS_ACCESS_KEY_ID,
        'aws_secret_access_key': settings.AWS_SECRET_ACCESS_KEY,
        'region_name': settings.AWS_REGION_NAME,
    }
    return boto3.client(service, **connection_kwargs)
|
||
def upload_file_to_s3(file):
    """Upload ``file`` to the configured S3 bucket under a generated key.

    Args:
        file: Object to upload. It is handed to ``upload_fileobj`` and its
            ``content_type`` attribute is sent as the S3 ``ContentType``.

    Returns:
        The generated object key (``str``) when the upload succeeds, or a
        ``dict`` with ``'error'`` and ``'message'`` keys when any exception
        is raised during the upload. Callers have to tell the two apart by
        type.
    """
    client = get_boto3_client(service='s3')
    object_key = generate_unique_filename(file)

    try:
        client.upload_fileobj(
            file,
            settings.AWS_STORAGE_BUCKET_NAME,
            object_key,
            ExtraArgs={'ContentType': file.content_type},
        )
    except Exception as exc:
        # Best effort: report the failure to the caller instead of raising.
        print(str(exc))
        return {'error': str(exc), 'message': 'Upload failed'}

    return object_key
|
||
def generate_unique_filename(file):
    """Return a random UUID-based filename that keeps ``file``'s extension.

    Args:
        file: Object exposing a ``name`` attribute (the original filename).

    Returns:
        ``"<uuid4>.<ext>"`` where ``ext`` is the text after the last dot in
        ``file.name``. When the name has no dot, or ends with a dot, there
        is no extension to keep and the bare UUID string is returned.
    """
    # rpartition reports whether a dot was found at all, so a name such as
    # "README" is not mistaken for its own extension.
    _, dot, ext = file.name.rpartition('.')
    suffix = f".{ext}" if dot and ext else ""
    return f"{uuid.uuid4()}{suffix}"
|
||
def create_textract_job(filename):
    """Start a Textract document-analysis job for an object in the S3 bucket.

    Args:
        filename: Key of the object inside ``settings.AWS_STORAGE_BUCKET_NAME``.

    Returns:
        The ``'JobId'`` value of the ``start_document_analysis`` response,
        or ``None`` when the response has no such key.
    """
    textract = get_boto3_client()

    # The document is referenced by bucket and key, not uploaded inline.
    s3_object = {
        'Bucket': settings.AWS_STORAGE_BUCKET_NAME,
        'Name': filename,
    }
    started = textract.start_document_analysis(
        DocumentLocation={'S3Object': s3_object},
        FeatureTypes=['FORMS', 'TABLES'],
    )
    return started.get('JobId')
|
||
# Get all responses, paginated | ||
def get_textract_results(job_id):
    """Fetch every result page of a Textract analysis job and merge them.

    Pages are requested with ``get_document_analysis`` until a response
    carries no ``'NextToken'``. The job status is not inspected here.

    Args:
        job_id: Identifier of the Textract job to read.

    Returns:
        The merged response produced by ``combine_responses``.
    """
    textract = get_boto3_client()

    pages = []
    request = {'JobId': job_id}
    while True:
        page = textract.get_document_analysis(**request)
        pages.append(page)

        token = page.get('NextToken')
        if not token:
            break
        # Ask for the page that follows the one just received.
        request['NextToken'] = token

    return combine_responses(pages)
|
||
def combine_responses(responses):
    """Merge paginated Textract responses into a single response-like dict.

    Args:
        responses: Iterable of response dicts, in page order.

    Returns:
        A dict with two keys:
        ``"DocumentMetadata"``: ``{"Pages": <value>}`` taken from the last
        response that provides a page count (``""`` when none does).
        ``"Blocks"``: the blocks of every response, concatenated in order.

    A response without ``"DocumentMetadata"`` or without ``"Blocks"`` is
    tolerated and contributes nothing for the missing part. Previously only
    the missing metadata was tolerated; a page without blocks raised
    ``KeyError``.
    """
    combined_response = {
        "DocumentMetadata": {
            "Pages": ""
        },
        "Blocks": []
    }

    for response in responses:
        metadata = response.get("DocumentMetadata") or {}
        if "Pages" in metadata:
            combined_response["DocumentMetadata"]["Pages"] = metadata["Pages"]
        # ``or []`` also covers an explicit ``None`` value for "Blocks".
        combined_response["Blocks"].extend(response.get("Blocks") or [])

    return combined_response
|
||
def convert_table_to_cleaned_dataframe(table):
    """Turn an extracted table into a DataFrame keyed by its first column.

    Args:
        table: Object providing ``strip_headers(...)`` whose result provides
            ``to_pandas()`` (presumably a Textract table wrapper; verify
            against the caller).

    Returns:
        A DataFrame whose column labels come from the first data row, whose
        index comes from the first column, and whose column labels and index
        values have surrounding whitespace stripped.
    """
    frame = table.strip_headers(
        column_headers=False,
        in_table_title=True,
        section_titles=True,
    ).to_pandas()

    # Promote the first row to column labels, then leave it out of the data.
    frame.columns = frame.iloc[0]
    body = frame[1:]

    # The left-most column identifies each row.
    body = body.set_index(body.columns[0])

    body.columns = body.columns.str.strip()
    body.index = body.index.str.strip()
    return body
|
||
def clean_string(input_string):
    """Drop commas and normalise whitespace in ``input_string``.

    Args:
        input_string: Text to clean, or ``None``.

    Returns:
        ``None`` for ``None`` input; otherwise the text with every comma
        removed, leading/trailing whitespace stripped and each run of
        whitespace collapsed to one space.
    """
    if input_string is not None:
        # split() with no argument discards all whitespace runs, so joining
        # the words with one space both trims and collapses in one step.
        words = input_string.replace(',', '').split()
        return ' '.join(words)
    return None
|
||
def clean_and_convert_string_to_decimal(input_string):
    """Parse a formatted amount such as ``"$1,234.5"`` into a 2-place Decimal.

    Every character other than digits and ``.`` is discarded before
    parsing, which removes currency symbols, thousands separators and
    whitespace.

    NOTE(review): a minus sign or accounting parentheses are discarded too,
    so ``"-12.50"`` parses as ``12.50``. Confirm that is intended.

    Args:
        input_string: Text holding the amount; may be empty or ``None``.

    Returns:
        The amount quantized to two decimal places. ``Decimal('0.00')`` is
        returned for empty/``None`` input and for text containing no digits
        at all (for example ``"N/A"`` or ``"$"``), which previously raised
        ``decimal.InvalidOperation``.

    Raises:
        decimal.InvalidOperation: If the remaining characters are not a
            valid number (for example ``"1.2.3"``).
    """
    if not input_string:
        return Decimal('0.00')

    numeric_text = re.sub(r'[^\d.]', '', input_string)
    # Nothing to parse when no digit survives the filter (e.g. "" or ".").
    if not re.search(r'\d', numeric_text):
        return Decimal('0.00')

    return Decimal(numeric_text).quantize(Decimal('0.00'))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Generated by Django 4.1.6 on 2024-07-12 15:59 | ||
|
||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration):
    """Create the ``S3File`` model in the ``api`` app."""

    dependencies = [
        ("api", "0068_journalentryitem_jei_date_idx_and_more"),
    ]

    operations = [
        migrations.CreateModel(
            name="S3File",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # The URL is the only column declared unique.
                ("url", models.URLField(unique=True)),
                ("user_filename", models.CharField(max_length=200)),
                ("s3_filename", models.CharField(max_length=200)),
            ],
        ),
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Generated by Django 4.1.6 on 2024-07-12 18:46 | ||
|
||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration):
    """Add the optional ``textract_job_id`` column to ``S3File``."""

    dependencies = [
        ("api", "0069_s3file"),
    ]

    operations = [
        migrations.AddField(
            model_name="s3file",
            name="textract_job_id",
            # Nullable and blank-able, so existing rows need no value.
            field=models.CharField(
                blank=True,
                max_length=200,
                null=True,
            ),
        ),
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Generated by Django 4.1.6 on 2024-07-15 15:58 | ||
|
||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration):
    """Create the ``DocSearch`` model with optional keyword/row/column text."""

    dependencies = [
        ("api", "0070_s3file_textract_job_id"),
    ]

    operations = [
        migrations.CreateModel(
            name="DocSearch",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # All three search columns are optional free text.
                (
                    "keyword",
                    models.CharField(blank=True, max_length=200, null=True),
                ),
                (
                    "row",
                    models.CharField(blank=True, max_length=200, null=True),
                ),
                (
                    "column",
                    models.CharField(blank=True, max_length=200, null=True),
                ),
            ],
        ),
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Generated by Django 4.1.6 on 2024-07-15 16:15 | ||
|
||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration):
    """Add the optional ``table_name`` column to ``DocSearch``."""

    dependencies = [
        ("api", "0071_docsearch"),
    ]

    operations = [
        migrations.AddField(
            model_name="docsearch",
            name="table_name",
            # Nullable and blank-able, so existing rows need no value.
            field=models.CharField(
                blank=True,
                max_length=200,
                null=True,
            ),
        ),
    ]
24 changes: 24 additions & 0 deletions
24
api/migrations/0073_docsearch_account_docsearch_selection.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Generated by Django 4.1.6 on 2024-07-15 19:22 | ||
|
||
from django.db import migrations, models | ||
import django.db.models.deletion | ||
|
||
|
||
class Migration(migrations.Migration):
    """Add the optional ``account`` link and ``selection`` choice to ``DocSearch``."""

    dependencies = [
        ("api", "0072_docsearch_table_name"),
    ]

    operations = [
        migrations.AddField(
            model_name="docsearch",
            name="account",
            # Deleting the account leaves the DocSearch row with a NULL link.
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                to="api.account",
            ),
        ),
        migrations.AddField(
            model_name="docsearch",
            name="selection",
            field=models.CharField(
                blank=True,
                choices=[
                    ("Company", "Company"),
                    ("Begin Period", "Begin Period"),
                    ("End Period", "End Period"),
                ],
                max_length=20,
                null=True,
            ),
        ),
    ]
53 changes: 53 additions & 0 deletions
53
api/migrations/0074_paystub_docsearch_prefill_s3file_prefill_and_more.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# Generated by Django 4.1.6 on 2024-07-16 14:43 | ||
|
||
from django.db import migrations, models | ||
import django.db.models.deletion | ||
|
||
|
||
class Migration(migrations.Migration):
    """Create ``Paystub`` / ``PaystubValue`` and link documents to ``Prefill``.

    Operations, in order: create ``Paystub``; add a required one-to-one
    ``prefill`` to ``DocSearch`` and to ``S3File``; create ``PaystubValue``;
    add ``Paystub.document`` and ``Paystub.journal_entry``.
    """

    dependencies = [
        ("api", "0073_docsearch_account_docsearch_selection"),
    ]

    operations = [
        migrations.CreateModel(
            name="Paystub",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("page_id", models.CharField(max_length=200)),
                ("title", models.CharField(max_length=200)),
            ],
        ),
        # NOTE(review): default=1 is a one-off value (preserve_default=False)
        # used to back-fill existing rows. On a one-to-one field this
        # presumably only succeeds when the table holds at most one row and
        # a Prefill with pk=1 exists. Confirm against the deployed data.
        migrations.AddField(
            model_name="docsearch",
            name="prefill",
            field=models.OneToOneField(
                default=1,
                on_delete=django.db.models.deletion.PROTECT,
                to="api.prefill",
            ),
            preserve_default=False,
        ),
        migrations.AddField(
            model_name="s3file",
            name="prefill",
            field=models.OneToOneField(
                default=1,
                on_delete=django.db.models.deletion.PROTECT,
                to="api.prefill",
            ),
            preserve_default=False,
        ),
        migrations.CreateModel(
            name="PaystubValue",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "amount",
                    models.DecimalField(decimal_places=2, max_digits=12),
                ),
                (
                    "account",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.PROTECT,
                        to="api.account",
                    ),
                ),
                (
                    "paystub",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="api.paystub",
                    ),
                ),
            ],
        ),
        migrations.AddField(
            model_name="paystub",
            name="document",
            field=models.ForeignKey(
                on_delete=django.db.models.deletion.CASCADE,
                to="api.s3file",
            ),
        ),
        # NOTE(review): the field is named journal_entry but targets
        # api.prefill. Confirm this is the intended relation.
        migrations.AddField(
            model_name="paystub",
            name="journal_entry",
            field=models.OneToOneField(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                to="api.prefill",
            ),
        ),
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Generated by Django 4.1.6 on 2024-07-16 14:47 | ||
|
||
from django.db import migrations, models | ||
import django.db.models.deletion | ||
|
||
|
||
class Migration(migrations.Migration):
    """Redefine ``DocSearch.prefill`` as a plain ``ForeignKey`` to ``Prefill``."""

    dependencies = [
        ("api", "0074_paystub_docsearch_prefill_s3file_prefill_and_more"),
    ]

    operations = [
        migrations.AlterField(
            model_name="docsearch",
            name="prefill",
            # PROTECT blocks deleting a Prefill that DocSearch rows reference.
            field=models.ForeignKey(
                on_delete=django.db.models.deletion.PROTECT,
                to="api.prefill",
            ),
        ),
    ]
Oops, something went wrong.