From fc5b2a6684880c128e2673b2c411bd662b673100 Mon Sep 17 00:00:00 2001 From: Ed Davey Date: Fri, 6 Dec 2019 19:54:56 +0000 Subject: [PATCH] Speed up Export::Submissions::Extract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Background It was proposed that the very slow daily `DataWarehouseExport.generate!` could be improved by adding indexes. Back in July 2018 Tekin identified that `Submissions#created_at` should be indexed (https://github.com/dxw/DataSubmissionServiceAPI/issues/23) as it started to be used in `Task#latest_submission` (https://github.com/dxw/DataSubmissionServiceAPI/commit/f46dd46f88e3347995a4504c5dad11130c9dbbd6). However `submissions.created_at` is not used in any of the four 'extractions' which make up the daily data warehouse export. But there are some other unindexed fields involved in extractions, as follows: `Export::Tasks::Extract` -> uses `Task#updated_at` (unindexed) `Export::Submissions::Extract` -> uses `Submission#updated_at` (unindexed) `Export::Invoices::Extract` -> uses `SubmissionEntry#updated_at` (unindexed) `Export::Contracts::Extract` -> uses `SubmissionEntry#updated_at` (unindexed) Accordingly there are some PRs coming through which: - 559: Add index to `submissions.created_at` - 560: Add indexes to `tasks.updated_at`, `submissions.updated_at`, `submission_entries.updated_at` - 561: Increase `maintenance_work_mem` setting for Postgres to allow these indexes to be added This commit improves the speed of one of the 4 extractions: `Export::Submissions::Extract`. It was identified (thanks Russell Garner!) that the subquery joining 'invoices' was a major bottleneck. In local tests, removing the join reduced a sample query from 26s to 0.4s. In order to properly remove this element from the query: - it has been recognised that `entry_count` (the count of submission entries of the type 'invoice') is not needed as users derive this information using other tooling in the 'data warehouse' - the `total_management_charge` projection could be removed, as since https://github.com/dxw/DataSubmissionServiceAPI/commit/d7db3639de6dfb6e4a93b12dfb44ff02ffe2b944 this value has been precomputed and stored on ingestion as `submissions.management_charge_total`. This field was already being returned as part of the top level selection `SELECT submissions.*`. - the `invoice_value` projection (`invoices.total_value`) is not required as again this information is available in the 'data warehouse'. - now that `invoice_entry_count` is no longer available it is now necessary find a new way to determine the `submission_type` ('no_business' or 'file'). It's been agreed that the presence (or not) of `submission_file_type` is sufficient to know if there’s a ‘file’ to be had or whether on the other hand it's a case of ‘no business’. As the structure of the exports (the headings or columns) has changed we've taken the opportunity to describe both: - the expected CSV headers more clearly as a vertical list, and - the values of the example row more closely by ensuring that the expected values as well as being present are also in the expected column. --- app/models/export/submissions.rb | 2 - app/models/export/submissions/extract.rb | 15 +--- app/models/export/submissions/row.rb | 14 +--- spec/factories/submission.rb | 1 + spec/models/export/relation_spec.rb | 80 ++++++++++++++++--- .../models/export/submissions/extract_spec.rb | 20 +---- spec/models/export/submissions/row_spec.rb | 31 +++---- 7 files changed, 90 insertions(+), 73 deletions(-) diff --git a/app/models/export/submissions.rb b/app/models/export/submissions.rb index f91648249..ea75521cd 100644 --- a/app/models/export/submissions.rb +++ b/app/models/export/submissions.rb @@ -10,8 +10,6 @@ class Submissions < ToIO SubmissionFileType ContractEntryCount ContractValue - InvoiceEntryCount - InvoiceValue CCSManagementChargeValue CCSManagementChargeRate CreatedDate diff --git a/app/models/export/submissions/extract.rb b/app/models/export/submissions/extract.rb index 50ffc60b7..3c6e0404f 100644 --- a/app/models/export/submissions/extract.rb +++ b/app/models/export/submissions/extract.rb @@ -13,9 +13,6 @@ def self.all_relevant(date_range = nil) frameworks.short_name AS _framework_short_name, orders.total_value AS _total_order_value, COALESCE(orders.entry_count, 0) AS _order_entry_count, - invoices.total_value AS _total_invoice_value, - invoices.total_management_charge AS _total_management_charge_value, - COALESCE(invoices.entry_count, 0) AS _invoice_entry_count, MIN(blobs.filename) :: text AS _first_filename POSTGRESQL ).joins( @@ -28,14 +25,6 @@ def self.all_relevant(date_range = nil) FROM submission_entries WHERE entry_type = 'order' GROUP BY submission_id) AS orders ON orders.submission_id = submissions.id - LEFT JOIN (SELECT - submission_id, - SUM(total_value) AS total_value, - SUM(management_charge) AS total_management_charge, - COUNT(*) AS entry_count - FROM submission_entries - WHERE entry_type = 'invoice' - GROUP BY submission_id) AS invoices ON invoices.submission_id = submissions.id LEFT JOIN submission_files ON submission_files.submission_id = submissions.id LEFT JOIN active_storage_attachments att ON (att.record_type = 'SubmissionFile' AND att.record_id = submission_files.id) @@ -44,8 +33,8 @@ def self.all_relevant(date_range = nil) ).where( filters ).group( - 'submissions.id, frameworks.short_name, orders.total_value, invoices.total_value,'\ - 'invoices.total_management_charge, orders.entry_count, invoices.entry_count' + 'submissions.id, frameworks.short_name, orders.total_value ,'\ + 'orders.entry_count' ) end end diff --git a/app/models/export/submissions/row.rb b/app/models/export/submissions/row.rb index 5f272885f..7b2c5e6fe 100644 --- a/app/models/export/submissions/row.rb +++ b/app/models/export/submissions/row.rb @@ -12,8 +12,6 @@ def row_values submission_file_type, contract_entry_count, order_value, - invoice_entry_count, - invoice_value, management_charge_value, management_charge_rate, created_date, @@ -34,7 +32,7 @@ def status end def submission_type - (invoice_entry_count + contract_entry_count).zero? ? 'no_business' : 'file' + submission._first_filename.blank? ? 'no_business' : 'file' end def submission_file_type @@ -42,7 +40,7 @@ def submission_file_type end def management_charge_value - format_money(submission._total_management_charge_value) + format_money(submission.management_charge_total) end def management_charge_rate @@ -72,10 +70,6 @@ def supplier_approved_date # Fields that are nil for MVP def finance_export_date; end - def invoice_value - format_money(submission._total_invoice_value) - end - def order_value format_money(submission._total_order_value) end @@ -86,10 +80,6 @@ def contract_entry_count submission._order_entry_count end - def invoice_entry_count - submission._invoice_entry_count - end - def format_money(amount) return '0.00' if amount.nil? diff --git a/spec/factories/submission.rb b/spec/factories/submission.rb index c8d96a1ef..c7c1b8fe2 100644 --- a/spec/factories/submission.rb +++ b/spec/factories/submission.rb @@ -26,6 +26,7 @@ factory :submission_with_validated_entries do aasm_state :in_review + management_charge_total 0.2 after(:create) do |submission, _evaluator| create_list(:invoice_entry, 2, :valid, submission: submission, total_value: 10.00, management_charge: 0.1) create_list(:order_entry, 1, :valid, submission: submission, total_value: 3.00) diff --git a/spec/models/export/relation_spec.rb b/spec/models/export/relation_spec.rb index 07e83bb8a..c0247e5a2 100644 --- a/spec/models/export/relation_spec.rb +++ b/spec/models/export/relation_spec.rb @@ -69,11 +69,12 @@ submitted_by: create(:user, name: 'SubForename SubSurname'), submitted_at: Time.zone.local(2018, 9, 20, 10, 11, 12), purchase_order_number: 'PO1234', + management_charge_total: 1.7912, files: [ create(:submission_file, :with_attachment) ], entries: [ - create(:invoice_entry, total_value: 179.12, management_charge: 1.7912), + create(:invoice_entry, total_value: 179.12), create(:order_entry, total_value: 804.00) ] ) @@ -91,19 +92,78 @@ end it 'writes a header for the submissions export' do - expect(output_lines.first).to eql( - 'TaskID,SubmissionID,Status,SubmissionType,SubmissionFileType,ContractEntryCount,' \ - 'ContractValue,InvoiceEntryCount,InvoiceValue,CCSManagementChargeValue,CCSManagementChargeRate,' \ - 'CreatedDate,CreatedBy,SupplierApprovedDate,SupplierApprovedBy,FinanceExportDate,PONumber' - ) + expect(output_lines.first).to eql(%w[ + TaskID + SubmissionID + Status + SubmissionType + SubmissionFileType + ContractEntryCount + ContractValue + CCSManagementChargeValue + CCSManagementChargeRate + CreatedDate + CreatedBy + SupplierApprovedDate + SupplierApprovedBy + FinanceExportDate + PONumber + ].join(',')) end it 'writes each submission to the export' do expect(output_lines.length).to eql(3) - expect(output_lines.find { |line| line.match('PO1234') }).to eql( - "#{submission.task.id},#{submission.id},supplier_accepted,file,xls,1,804.00,1,179.12,1.79," \ - '#REMOVED,2018-09-18T14:20:35Z,CrForename CrSurname,2018-09-20T10:11:12Z,SubForename SubSurname,,PO1234' - ) + + submission_record = CSV.parse(output_lines.join("\n"), headers: true).find do |row| + row['SubmissionType'] == 'file' + end + + aggregate_failures do + expect(submission_record.fetch('TaskID')) + .to eq(submission.task.id) + + expect(submission_record.fetch('SubmissionID')) + .to eq(submission.id) + + expect(submission_record.fetch('Status')) + .to eq('supplier_accepted') + + expect(submission_record.fetch('SubmissionType')) + .to eq('file') + + expect(submission_record.fetch('SubmissionFileType')) + .to eq('xls') + + expect(submission_record.fetch('ContractEntryCount')) + .to eq('1') + + expect(submission_record.fetch('ContractValue')) + .to eq('804.00') + + expect(submission_record.fetch('CCSManagementChargeValue')) + .to eq('1.79') + + expect(submission_record.fetch('CCSManagementChargeRate')) + .to eq('#REMOVED') + + expect(submission_record.fetch('CreatedDate')) + .to eq('2018-09-18T14:20:35Z') + + expect(submission_record.fetch('CreatedBy')) + .to eq('CrForename CrSurname') + + expect(submission_record.fetch('SupplierApprovedDate')) + .to eq('2018-09-20T10:11:12Z') + + expect(submission_record.fetch('SupplierApprovedBy')) + .to eq('SubForename SubSurname') + + expect(submission_record.fetch('FinanceExportDate')) + .to be_nil + + expect(submission_record.fetch('PONumber')) + .to eq('PO1234') + end end it 'has as many headers as row values' do diff --git a/spec/models/export/submissions/extract_spec.rb b/spec/models/export/submissions/extract_spec.rb index 133cb71af..d6269578d 100644 --- a/spec/models/export/submissions/extract_spec.rb +++ b/spec/models/export/submissions/extract_spec.rb @@ -44,24 +44,10 @@ end end - describe '#_invoice_entry_count as a projection on the Submission model' do - it 'is 0 on the no-business and 2 on the file' do - expect(extract_no_business_submission._invoice_entry_count).to be_zero - expect(extract_file_submission._invoice_entry_count).to eql(2) - end - end - - describe '#_total_invoice_value as a projection on the Submission model' do - it 'is nil on the no-business and 20.0 on the file' do - expect(extract_no_business_submission._total_invoice_value).to be_nil - expect(extract_file_submission._total_invoice_value.to_digits).to eql('20.0') - end - end - - describe '#_total_management_charge_value as a projection on the Submission model' do + describe '#management_charge_total as an attribute of the Submission model' do it 'is nil on the no-business and 20.0 on the file' do - expect(extract_no_business_submission._total_invoice_value).to be_nil - expect(extract_file_submission._total_management_charge_value.to_digits).to eql('0.2') + expect(extract_no_business_submission.management_charge_total).to be_nil + expect(extract_file_submission.management_charge_total.to_digits).to eql('0.2') end end diff --git a/spec/models/export/submissions/row_spec.rb b/spec/models/export/submissions/row_spec.rb index e3025f88a..9eb6630cc 100644 --- a/spec/models/export/submissions/row_spec.rb +++ b/spec/models/export/submissions/row_spec.rb @@ -32,28 +32,21 @@ end describe '#submission_type and its dependence on _ projected fields' do - before do - allow(row).to receive(:invoice_entry_count).and_return(invoices) - allow(row).to receive(:contract_entry_count).and_return(orders) - end - subject { row.submission_type } - context 'there are no invoices or order entries' do - let(:invoices) { 0 } - let(:orders) { 0 } + context 'when no file has been submitted' do + before do + allow(submission).to receive(:_first_filename).and_return(nil) + end + it { is_expected.to eql('no_business') } end - context 'there is at least one invoice entry' do - let(:invoices) { 1 } - let(:orders) { 0 } - it { is_expected.to eql('file') } - end + context 'when a file has been submitted' do + before do + allow(submission).to receive(:_first_filename).and_return('upload.xls') + end - context 'there is at least one order entry' do - let(:invoices) { 0 } - let(:orders) { 1 } it { is_expected.to eql('file') } end end @@ -72,10 +65,10 @@ end end - describe '#management_charge_value' do - let(:submission) { double 'Submission', _total_management_charge_value: 13.23 } + describe '#management_charge_total' do + let(:submission) { double 'Submission', management_charge_total: 13.23 } - it 'presents the charge identified by the _total_management_charge_value projected field' do + it 'presents the charge identified by the management_charge_total attr' do expect(row.management_charge_value).to eq '13.23' end end