From 21d96c9e70917bece57efb045cf3d5f578b6b71c Mon Sep 17 00:00:00 2001 From: Mikko Kortelainen Date: Mon, 2 Sep 2024 14:08:37 +0300 Subject: [PATCH] Spotless (#300) * add spotless * apply spotless * update license check, fix license in evalTest --- eclipse-java-formatter.xml | 450 +++++ license-header | 45 + pom.xml | 184 +- .../pth10/ast/DPLAuditInformation.java | 17 +- .../pth10/ast/DPLInternalStreamingQuery.java | 8 +- .../DPLInternalStreamingQueryListener.java | 74 +- .../pth10/ast/DPLParserCatalystContext.java | 46 +- .../pth10/ast/DPLParserCatalystVisitor.java | 101 +- .../teragrep/pth10/ast/DPLParserConfig.java | 84 +- .../com/teragrep/pth10/ast/DPLTimeFormat.java | 74 +- .../teragrep/pth10/ast/DefaultTimeFormat.java | 29 +- .../com/teragrep/pth10/ast/MapTypeColumn.java | 45 +- .../com/teragrep/pth10/ast/NullValue.java | 28 +- .../com/teragrep/pth10/ast/NumericText.java | 12 +- .../com/teragrep/pth10/ast/PrettyTree.java | 17 +- .../java/com/teragrep/pth10/ast/StepList.java | 87 +- .../java/com/teragrep/pth10/ast/Text.java | 11 +- .../com/teragrep/pth10/ast/TextString.java | 10 +- .../com/teragrep/pth10/ast/TimeRange.java | 12 +- .../com/teragrep/pth10/ast/UnquotedText.java | 11 +- .../teragrep/pth10/ast/bo/CatalystNode.java | 73 +- .../com/teragrep/pth10/ast/bo/ColumnNode.java | 63 +- .../teragrep/pth10/ast/bo/ElementNode.java | 90 +- .../com/teragrep/pth10/ast/bo/ListNode.java | 21 +- .../java/com/teragrep/pth10/ast/bo/Node.java | 191 +- .../com/teragrep/pth10/ast/bo/NullNode.java | 7 +- .../teragrep/pth10/ast/bo/StepListNode.java | 12 +- .../com/teragrep/pth10/ast/bo/StepNode.java | 7 +- .../teragrep/pth10/ast/bo/StringListNode.java | 54 +- .../com/teragrep/pth10/ast/bo/StringNode.java | 14 +- .../teragrep/pth10/ast/bo/SubSearchNode.java | 173 +- .../java/com/teragrep/pth10/ast/bo/Token.java | 98 +- .../pth10/ast/bo/TranslationResultNode.java | 7 +- .../teragrep/pth10/ast/commands/EmitMode.java | 22 +- .../commands/aggregate/AggregateFunction.java | 274 +-- .../aggregate/UDAFs/AggregatorMode.java | 58 +- .../UDAFs/BufferClasses/CountBuffer.java | 93 +- .../UDAFs/BufferClasses/ListBuffer.java | 121 +- .../UDAFs/BufferClasses/MapBuffer.java | 71 +- .../UDAFs/BufferClasses/MinMaxBuffer.java | 31 +- .../UDAFs/BufferClasses/ModeBuffer.java | 108 +- .../UDAFs/BufferClasses/PercentileBuffer.java | 178 +- .../UDAFs/BufferClasses/SumBuffer.java | 7 +- .../BufferClasses/TimestampMapBuffer.java | 319 ++-- .../UDAFs/BufferClasses/ValuesBuffer.java | 98 +- .../aggregate/UDAFs/CountAggregator.java | 9 +- .../UDAFs/DistinctCountAggregator.java | 113 +- .../UDAFs/EarliestLatestAggregator.java | 179 +- .../EarliestLatestAggregator_Double.java | 80 +- .../EarliestLatestAggregator_String.java | 98 +- .../UDAFs/ExactPercentileAggregator.java | 190 +- .../UDAFs/FieldIndex/FieldIndex.java | 7 +- .../UDAFs/FieldIndex/FieldIndexImpl.java | 8 +- .../UDAFs/FieldIndex/FieldIndexStub.java | 8 +- .../aggregate/UDAFs/MinMaxAggregator.java | 57 +- .../aggregate/UDAFs/ModeAggregator.java | 151 +- .../aggregate/UDAFs/SumAggregator.java | 7 +- .../aggregate/UDAFs/UDAF_DistinctCount.java | 301 ++-- .../aggregate/UDAFs/ValuesAggregator.java | 238 +-- .../aggregate/utils/PercentileApprox.java | 62 +- .../commands/evalstatement/EvalStatement.java | 889 +++++----- .../evalstatement/UDFs/Cidrmatch.java | 173 +- .../commands/evalstatement/UDFs/Commands.java | 45 +- .../evalstatement/UDFs/EvalArithmetic.java | 24 +- .../evalstatement/UDFs/EvalOperation.java | 43 +- .../commands/evalstatement/UDFs/IfClause.java | 15 +- 
.../UDFs/InverseHyperbolicFunction.java | 136 +- .../commands/evalstatement/UDFs/IsType.java | 56 +- .../evalstatement/UDFs/JSONValid.java | 41 +- .../evalstatement/UDFs/LikeComparison.java | 12 +- .../commands/evalstatement/UDFs/MinMax.java | 39 +- .../commands/evalstatement/UDFs/Mvdedup.java | 40 +- .../commands/evalstatement/UDFs/Mvindex.java | 75 +- .../commands/evalstatement/UDFs/Mvjoin.java | 50 +- .../commands/evalstatement/UDFs/Mvrange.java | 124 +- .../commands/evalstatement/UDFs/Mvzip.java | 54 +- .../evalstatement/UDFs/RandomNumber.java | 30 +- .../evalstatement/UDFs/RegexMatch.java | 208 +-- .../evalstatement/UDFs/Relative_time.java | 25 +- .../commands/evalstatement/UDFs/Sigfig.java | 254 +-- .../commands/evalstatement/UDFs/Spath.java | 368 ++-- .../evalstatement/UDFs/TimeToUnixTime.java | 23 +- .../UDFs/TimestampArithmetic.java | 28 +- .../commands/evalstatement/UDFs/Tonumber.java | 64 +- .../commands/evalstatement/UDFs/TypeOf.java | 56 +- .../evalstatement/UDFs/UrlDecode.java | 41 +- .../LogicalStatementCatalyst.java | 132 +- .../logicalstatement/LogicalStatementXML.java | 100 +- .../logicalstatement/TimeStatement.java | 52 +- .../UDFs/SearchComparison.java | 26 +- .../AddtotalsTransformation.java | 21 +- .../ChartTransformation.java | 94 +- .../ConvertTransformation.java | 21 +- .../transformstatement/DPLTransformation.java | 15 +- .../DedupTransformation.java | 47 +- .../EvalTransformation.java | 27 +- .../EventstatsTransformation.java | 11 +- .../ExplainTransformation.java | 17 +- .../FieldsTransformation.java | 44 +- .../FillnullTransformation.java | 9 +- .../FormatTransformation.java | 25 +- .../IplocationTransformation.java | 15 +- .../JoinTransformation.java | 571 +++--- .../MakeresultsTransformation.java | 26 +- .../PredictTransformation.java | 34 +- .../RangemapTransformation.java | 18 +- .../RegexTransformation.java | 14 +- .../RenameTransformation.java | 12 +- .../ReplaceTransformation.java | 10 +- .../transformstatement/RexTransformation.java | 16 +- .../SearchTransformation.java | 18 +- .../SendemailTransformation.java | 1575 +++++++++-------- .../SortTransformation.java | 36 +- .../SpathTransformation.java | 23 +- .../StatsTransformation.java | 301 ++-- .../StrcatTransformation.java | 185 +- .../TableTransformation.java | 11 +- .../TeragrepTransformation.java | 149 +- .../TimechartTransformation.java | 268 ++- .../transformstatement/TopTransformation.java | 26 +- .../TransformStatement.java | 46 +- .../WhereTransformation.java | 28 +- .../XmlkvTransformation.java | 10 +- .../accum/AccumTransformation.java | 95 +- .../accum/AccumulatedSum.java | 52 +- .../accum/BatchCollector.java | 162 +- .../addtotals/AddtotalsUDF.java | 7 +- .../transformstatement/convert/Auto.java | 9 +- .../transformstatement/convert/Ctime.java | 33 +- .../transformstatement/convert/Dur2Sec.java | 51 +- .../transformstatement/convert/Memk.java | 68 +- .../transformstatement/convert/Mktime.java | 25 +- .../transformstatement/convert/Mstime.java | 62 +- .../transformstatement/convert/Rmunit.java | 370 ++-- .../iplocation/IplocationGeoIPDataMapper.java | 20 +- .../iplocation/IplocationRirDataMapper.java | 25 +- .../iplocation/RirLookupResult.java | 18 +- .../transformstatement/regex/RegexMatch.java | 20 +- .../replace/ReplaceCmd.java | 98 +- .../rex/CheckedSedString.java | 33 +- .../rex/RexExtractModeUDF.java | 7 +- .../transformstatement/rex/RexSedModeUDF.java | 22 +- .../rex4j/NamedGroupsRex.java | 43 +- .../rex4j/Rex4jTransformation.java | 49 +- .../sendemail/DatasetToTextBuilder.java | 376 
++-- .../sendemail/SendemailResultsProcessor.java | 1081 +++++------ .../teragrep/HdfsSaveMetadata.java | 14 +- .../teragrep/SyslogStreamer.java | 48 +- .../transformstatement/xmlkv/XmlkvUDF.java | 23 +- .../pth10/ast/time/RelativeOffset.java | 20 +- .../pth10/ast/time/RelativeTimeParser.java | 26 +- .../pth10/ast/time/RelativeTimestamp.java | 17 +- .../teragrep/pth10/ast/time/SnapToTime.java | 13 +- .../pth10/datasources/ArchiveQuery.java | 22 +- .../pth10/datasources/DPLDatasource.java | 66 +- .../datasources/GeneratedDatasource.java | 137 +- .../pth10/datasources/S3CredentialWallet.java | 12 +- .../teragrep/pth10/steps/AbstractStep.java | 9 +- .../pth10/steps/EmptyDataframeStep.java | 11 +- .../com/teragrep/pth10/steps/Flushable.java | 11 +- .../com/teragrep/pth10/steps/NullStep.java | 9 +- .../teragrep/pth10/steps/ParsedResult.java | 13 +- .../com/teragrep/pth10/steps/TypeParser.java | 42 +- .../teragrep/pth10/steps/accum/AccumStep.java | 71 +- .../pth10/steps/accum/IntermediateState.java | 21 +- .../addtotals/AddtotalsIntermediateState.java | 40 +- .../pth10/steps/addtotals/AddtotalsStep.java | 32 +- .../addtotals/MultiPrecisionValuePair.java | 13 +- .../steps/addtotals/NumericColumnSum.java | 15 +- .../pth10/steps/chart/AbstractChartStep.java | 8 +- .../teragrep/pth10/steps/chart/ChartStep.java | 11 +- .../steps/convert/AbstractConvertStep.java | 9 +- .../pth10/steps/convert/ConvertCommand.java | 12 +- .../pth10/steps/convert/ConvertStep.java | 109 +- .../pth10/steps/dedup/AbstractDedupStep.java | 8 +- .../teragrep/pth10/steps/dedup/DedupStep.java | 63 +- .../pth10/steps/dpl/AbstractDplStep.java | 9 +- .../com/teragrep/pth10/steps/dpl/DplStep.java | 9 +- .../pth10/steps/eval/AbstractEvalStep.java | 8 +- .../teragrep/pth10/steps/eval/EvalStep.java | 13 +- .../eventstats/AbstractEventstatsStep.java | 8 +- .../steps/eventstats/EventstatsStep.java | 21 +- .../steps/explain/AbstractExplainStep.java | 9 +- .../pth10/steps/explain/ExplainStep.java | 15 +- .../steps/fields/AbstractFieldsStep.java | 8 +- .../pth10/steps/fields/FieldsStep.java | 10 +- .../steps/fillnull/AbstractFillnullStep.java | 8 +- .../pth10/steps/fillnull/FillnullStep.java | 28 +- .../steps/format/AbstractFormatStep.java | 7 +- .../pth10/steps/format/FormatStep.java | 46 +- .../iplocation/AbstractIplocationStep.java | 11 +- .../steps/iplocation/IplocationStep.java | 51 +- .../pth10/steps/join/AbstractJoinStep.java | 8 +- .../teragrep/pth10/steps/join/JoinStep.java | 61 +- .../logicalCatalyst/LogicalCatalystStep.java | 10 +- .../steps/logicalXML/LogicalXMLStep.java | 19 +- .../makeresults/AbstractMakeresultsStep.java | 8 +- .../steps/makeresults/MakeresultsStep.java | 69 +- .../steps/predict/AbstractPredictStep.java | 10 +- .../pth10/steps/predict/PredictStep.java | 218 +-- .../steps/rangemap/AbstractRangemapStep.java | 7 +- .../pth10/steps/rangemap/RangemapStep.java | 11 +- .../pth10/steps/rangemap/RangemapUDF.java | 23 +- .../pth10/steps/regex/AbstractRegexStep.java | 8 +- .../teragrep/pth10/steps/regex/RegexStep.java | 11 +- .../steps/rename/AbstractRenameStep.java | 9 +- .../pth10/steps/rename/RenameStep.java | 10 +- .../steps/replace/AbstractReplaceStep.java | 8 +- .../pth10/steps/replace/ReplaceStep.java | 12 +- .../pth10/steps/rex/AbstractRexStep.java | 8 +- .../com/teragrep/pth10/steps/rex/RexStep.java | 21 +- .../pth10/steps/rex4j/AbstractRex4jStep.java | 8 +- .../teragrep/pth10/steps/rex4j/Rex4jStep.java | 31 +- .../steps/search/AbstractSearchStep.java | 7 +- .../pth10/steps/search/SearchStep.java | 8 +- 
.../sendemail/AbstractSendemailStep.java | 8 +- .../pth10/steps/sendemail/SendemailStep.java | 18 +- .../pth10/steps/sort/AbstractSortStep.java | 10 +- .../pth10/steps/sort/AggregatedSort.java | 7 +- .../teragrep/pth10/steps/sort/SortStep.java | 14 +- .../pth10/steps/spath/AbstractSpathStep.java | 8 +- .../teragrep/pth10/steps/spath/SpathStep.java | 45 +- .../pth10/steps/stats/AbstractStatsStep.java | 8 +- .../teragrep/pth10/steps/stats/StatsStep.java | 17 +- .../steps/strcat/AbstractStrcatStep.java | 8 +- .../pth10/steps/strcat/StrcatStep.java | 15 +- .../subsearch/AbstractSubsearchStep.java | 8 +- .../pth10/steps/subsearch/SubsearchStep.java | 41 +- .../pth10/steps/table/AbstractTableStep.java | 8 +- .../teragrep/pth10/steps/table/TableStep.java | 24 +- .../teragrep/DecompressibleInputStream.java | 7 +- .../steps/teragrep/TeragrepBloomStep.java | 69 +- .../steps/teragrep/TeragrepDynatraceStep.java | 38 +- .../teragrep/TeragrepHdfsDeleteStep.java | 35 +- .../steps/teragrep/TeragrepHdfsListStep.java | 58 +- .../steps/teragrep/TeragrepHdfsLoadStep.java | 211 ++- .../steps/teragrep/TeragrepHdfsSaveStep.java | 68 +- .../steps/teragrep/TeragrepHdfsStep.java | 26 +- .../steps/teragrep/TeragrepKafkaStep.java | 84 +- .../steps/teragrep/TeragrepSyslogStep.java | 12 +- .../steps/teragrep/TeragrepSystemStep.java | 32 +- .../BloomFilterForeachPartitionFunction.java | 11 +- .../teragrep/bloomfilter/FilterSizes.java | 28 +- .../teragrep/bloomfilter/LazyConnection.java | 29 +- .../bloomfilter/TeragrepBloomFilter.java | 55 +- .../teragrep/dynatrace/DynatraceItem.java | 27 +- .../teragrep/dynatrace/DynatraceMetadata.java | 23 +- .../timechart/AbstractTimechartStep.java | 8 +- .../pth10/steps/timechart/TimechartStep.java | 16 +- .../tokenizer/AbstractTokenizerStep.java | 15 +- .../pth10/steps/tokenizer/TokenizerStep.java | 29 +- .../pth10/steps/top/AbstractTopStep.java | 9 +- .../com/teragrep/pth10/steps/top/TopStep.java | 10 +- .../pth10/steps/where/AbstractWhereStep.java | 8 +- .../teragrep/pth10/steps/where/WhereStep.java | 10 +- .../pth10/steps/xmlkv/AbstractXmlkvStep.java | 7 +- .../teragrep/pth10/steps/xmlkv/XmlkvStep.java | 29 +- .../AccumTransformationStreamingTest.java | 213 ++- .../pth10/AddtotalsTransformationTest.java | 88 +- .../AggregateAfterSequentialCommandTest.java | 124 +- .../pth10/BloomFilterOperationsTest.java | 132 +- .../teragrep/pth10/CatalystVisitorTest.java | 166 +- .../pth10/ConvertTransformationTest.java | 1127 +++++++----- .../com/teragrep/pth10/DPLTimeFormatTest.java | 7 +- .../pth10/DedupTransformationTest.java | 43 +- .../teragrep/pth10/DefaultTimeFormatTest.java | 7 +- .../pth10/DynatraceTestAPICallback.java | 29 +- .../teragrep/pth10/EarliestLatestTest.java | 532 ++++-- .../pth10/EventstatsTransformationTest.java | 172 +- .../pth10/FillnullTransformationTest.java | 119 +- .../pth10/FormatTransformationTest.java | 368 ++-- .../pth10/IplocationTransformationTest.java | 333 ++-- .../pth10/JoinTransformationTest.java | 491 ++--- .../pth10/MakeresultsTransformationTest.java | 127 +- .../pth10/PredictTransformationTest.java | 135 +- .../pth10/RangemapTransformationTest.java | 195 +- .../pth10/RegexTransformationTest.java | 120 +- .../pth10/RenameTransformationTest.java | 64 +- .../pth10/ReplaceTransformationTest.java | 193 +- .../pth10/Rex4jTransformationTest.java | 237 +-- .../teragrep/pth10/RexTransformationTest.java | 405 +++-- .../pth10/SearchTransformationTest.java | 581 +++--- .../pth10/SendemailTransformationTest.java | 519 +++--- .../pth10/SortTransformationTest.java | 
328 ++-- .../pth10/SpathTransformationTest.java | 518 ++++-- .../java/com/teragrep/pth10/StackTest.java | 324 ++-- .../pth10/StrcatTransformationTest.java | 190 +- .../com/teragrep/pth10/StreamingTestUtil.java | 124 +- .../pth10/SubsearchStreamingTest.java | 139 +- .../com/teragrep/pth10/SyslogStreamTest.java | 100 +- .../pth10/TableTransformationTest.java | 129 +- .../teragrep/pth10/TeragrepDynatraceTest.java | 117 +- .../com/teragrep/pth10/TeragrepKafkaTest.java | 139 +- .../pth10/TeragrepTransformationTest.java | 620 ++++--- .../pth10/TimechartStreamingTest.java | 212 ++- .../com/teragrep/pth10/TokenizerTest.java | 91 +- .../pth10/UnimplementedCommandTest.java | 51 +- .../com/teragrep/pth10/UnquotedTextTest.java | 7 +- .../pth10/WhereTransformationTest.java | 143 +- .../pth10/XmlkvTransformationTest.java | 141 +- .../pth10/chartTransformationTest.java | 869 ++++----- .../java/com/teragrep/pth10/commandTest.java | 73 +- .../java/com/teragrep/pth10/evalTest.java | 6 +- .../pth10/fieldTransformationTest.java | 300 ++-- .../com/teragrep/pth10/indexQueryTest.java | 44 +- .../teragrep/pth10/logicalOperationTest.java | 778 ++++---- .../com/teragrep/pth10/relativeTimeTest.java | 319 ++-- .../statsTransformationStreamingTest.java | 420 +++-- .../pth10/statsTransformationTest.java | 857 +++++---- .../teragrep/bloomfilter/FilterSizesTest.java | 44 +- .../bloomfilter/TeragrepBloomFilterTest.java | 106 +- .../com/teragrep/pth10/subSearchTest.java | 91 +- .../com/teragrep/pth10/syntaxErrorTest.java | 123 +- .../pth10/translationTests/ChartTest.java | 23 +- .../pth10/translationTests/ConvertTest.java | 7 +- .../pth10/translationTests/DedupTest.java | 16 +- .../pth10/translationTests/DplTest.java | 8 +- .../pth10/translationTests/EvalTest.java | 24 +- .../pth10/translationTests/ExplainTest.java | 7 +- .../pth10/translationTests/FieldsTest.java | 9 +- .../translationTests/IplocationTest.java | 7 +- .../pth10/translationTests/JoinTest.java | 17 +- .../translationTests/MakeresultsTest.java | 9 +- .../pth10/translationTests/PredictTest.java | 8 +- .../pth10/translationTests/RegexTest.java | 7 +- .../pth10/translationTests/RenameTest.java | 8 +- .../pth10/translationTests/ReplaceTest.java | 7 +- .../pth10/translationTests/Rex4jTest.java | 10 +- .../pth10/translationTests/RexTest.java | 6 +- .../pth10/translationTests/SendemailTest.java | 18 +- .../pth10/translationTests/SortTest.java | 14 +- .../pth10/translationTests/SpathTest.java | 16 +- .../pth10/translationTests/StatsTest.java | 27 +- .../pth10/translationTests/StrcatTest.java | 8 +- .../pth10/translationTests/TableTest.java | 7 +- .../pth10/translationTests/TeragrepTest.java | 45 +- .../pth10/translationTests/TimechartTest.java | 24 +- .../pth10/translationTests/TopTest.java | 11 +- .../pth10/translationTests/WhereTest.java | 24 +- src/test/java/com/teragrep/pth10/utils.java | 173 +- .../java/com/teragrep/pth10/whereTest.java | 476 +++-- 341 files changed, 18092 insertions(+), 14513 deletions(-) create mode 100644 eclipse-java-formatter.xml create mode 100644 license-header diff --git a/eclipse-java-formatter.xml new file mode 100644 index 0000000..53b9f2a --- /dev/null +++ b/eclipse-java-formatter.xml @@ -0,0 +1,450 @@
+ [450 added lines: Eclipse JDT formatter profile settings; XML element content omitted]
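The file above is the Eclipse JDT formatter profile referenced by the spotless-maven-plugin configuration added to pom.xml further below. Assuming the standard goals of com.diffplug.spotless:spotless-maven-plugin (the goal invocations themselves are not part of this patch), formatting would typically be verified and applied with:

    mvn spotless:check   # fail the build when sources do not match the profile or lack the license header
    mvn spotless:apply   # rewrite sources, pom.xml and the listed misc files to match the profile

The pom.xml change below also binds the check goal to the compile phase, so a plain build verifies formatting.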
diff --git a/license-header new file mode 100644 index 0000000..df3e194 --- /dev/null +++ b/license-header @@ -0,0 +1,45 @@
+/*
+ * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10)
+ * Copyright (C) 2019-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */ diff --git a/pom.xml b/pom.xml index ae84894..be4b918 100644 --- a/pom.xml +++ b/pom.xml @@ -1,34 +1,13 @@ - - jar + 4.0.0 + com.teragrep pth_10 ${revision}${sha1}${changelist} + jar pth_10 Data Processing Language (DPL) translator for Apache Spark https://teragrep.com - com.teragrep - - UTF-8 - 1.8 - 1.8 - 1.8 - 0.0.1 - -SNAPSHOT - - 5.7.1 - 4.0.1 - 1.7.6 - 6.1.4 - 0.4.3 - 4.3.0 - 3.1.2 - 3.0.0 - 10.0.1 - 3.1.1 - GNU Affero General Public License v3.0 @@ -60,6 +39,25 @@ scm:git:git@github.com:teragrep/pth_10.git https://github.com/teragrep/pth_10/tree/master + + -SNAPSHOT + 1.8 + 5.7.1 + 1.8 + 1.8 + UTF-8 + 0.0.1 + + 3.0.0 + 10.0.1 + 3.1.1 + 0.4.3 + 6.1.4 + 3.1.2 + 4.3.0 + 4.0.1 + 1.7.6 + @@ -165,11 +163,11 @@ 1.6.2 - - com.github.dhorions - boxable - 1.7.0 - + + com.github.dhorions + boxable + 1.7.0 + @@ -258,6 +256,59 @@ ${project.basedir}/target pth_10 + + com.diffplug.spotless + spotless-maven-plugin + 2.30.0 + + + + ${project.basedir}/eclipse-java-formatter.xml + 4.10.0 + + + + ${project.basedir}/license-header + + + + + + UTF-8 + \n + true + false + 2 + recommended_2008_06 + true + true + true + + + + + + .gitattributes + .gitignore + + + + + true + 4 + + + + + + + + check + + compile + + + org.apache.maven.plugins maven-enforcer-plugin @@ -276,7 +327,7 @@ [1.8,1.9) - + All plugins are required to contain specific version. org.apache.maven.plugins:maven-site-plugin,org.apache.maven.plugins:maven-resources-plugin,org.apache.maven.plugins:maven-clean-plugin,org.apache.maven.plugins:maven-install-plugin,org.apache.maven.plugins:maven-deploy-plugin @@ -323,21 +374,13 @@ apache-rat-plugin 0.15 false - - - test - - check - - - false Also allow the license url to be https. - https://github.com/teragrep/teragrep/blob/main/LICENSE + Copyright (C) 2019-2024 Suomen Kanuuna Oy @@ -362,8 +405,17 @@ src/test/resources/hdfslist/** src/test/resources/csv/** + eclipse-java-formatter.xml + + + + check + + test + + org.apache.maven.plugins @@ -397,18 +449,18 @@ - flatten - process-resources + flatten.clean - flatten + clean + clean - flatten.clean - clean + flatten - clean + flatten + process-resources @@ -419,10 +471,10 @@ add-source - process-sources add-source + process-sources target/generated-sources/antlr4 @@ -451,10 +503,10 @@ 1.0.0 - generate-resources write-project-properties + generate-resources ${project.build.outputDirectory}/maven.properties @@ -466,6 +518,17 @@ publish-maven-central + + + ossrh + Central Repository OSSRH + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + ossrh + https://oss.sonatype.org/content/repositories/snapshots + + @@ -475,10 +538,10 @@ sign-artifacts - verify sign + verify --pinentry-mode @@ -490,20 +553,16 @@ + + + publish-github-packages - - ossrh - https://oss.sonatype.org/content/repositories/snapshots - - ossrh - Central Repository OSSRH - https://oss.sonatype.org/service/local/staging/deploy/maven2/ + github + GitHub Packages + https://maven.pkg.github.com/teragrep/pth_10 - - - publish-github-packages @@ -513,10 +572,10 @@ sign-artifacts - verify sign + verify --pinentry-mode @@ -528,13 +587,6 @@ - - - github - GitHub Packages - https://maven.pkg.github.com/teragrep/pth_10 - - diff --git a/src/main/java/com/teragrep/pth10/ast/DPLAuditInformation.java b/src/main/java/com/teragrep/pth10/ast/DPLAuditInformation.java index df50294..7f9725a 100644 --- a/src/main/java/com/teragrep/pth10/ast/DPLAuditInformation.java +++ b/src/main/java/com/teragrep/pth10/ast/DPLAuditInformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator 
PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,17 +43,17 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast; /** * Audit information for datasource auditing support */ public class DPLAuditInformation { - private String query=""; + + private String query = ""; private String reason = ""; - private String user =""; - private String teragrepAuditPluginClassName=""; + private String user = ""; + private String teragrepAuditPluginClassName = ""; public void setQuery(String query) { this.query = query; @@ -66,6 +66,7 @@ public String getQuery() { public void setReason(String reason) { this.reason = reason; } + public String getReason() { return reason; } @@ -73,6 +74,7 @@ public String getReason() { public void setUser(String user) { this.user = user; } + public String getUser() { return user; } @@ -80,6 +82,7 @@ public String getUser() { public void setTeragrepAuditPluginClassName(String teragrepAuditPluginClassName) { this.teragrepAuditPluginClassName = teragrepAuditPluginClassName; } + public String getTeragrepAuditPluginClassName() { return teragrepAuditPluginClassName; } diff --git a/src/main/java/com/teragrep/pth10/ast/DPLInternalStreamingQuery.java b/src/main/java/com/teragrep/pth10/ast/DPLInternalStreamingQuery.java index 7507fed..76f0c17 100644 --- a/src/main/java/com/teragrep/pth10/ast/DPLInternalStreamingQuery.java +++ b/src/main/java/com/teragrep/pth10/ast/DPLInternalStreamingQuery.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast; import org.apache.spark.sql.streaming.StreamingQuery; @@ -54,6 +53,7 @@ * A wrapper class for Spark's StreamingQuery. Allows bundling additional metadata with the StreamingQuery object. 
*/ public class DPLInternalStreamingQuery implements Serializable { + private StreamingQuery query; private long lastBatchId = -1; diff --git a/src/main/java/com/teragrep/pth10/ast/DPLInternalStreamingQueryListener.java b/src/main/java/com/teragrep/pth10/ast/DPLInternalStreamingQueryListener.java index 0f56ad0..3e9f708 100644 --- a/src/main/java/com/teragrep/pth10/ast/DPLInternalStreamingQueryListener.java +++ b/src/main/java/com/teragrep/pth10/ast/DPLInternalStreamingQueryListener.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast; import com.teragrep.pth_06.ArchiveMicroStreamReader; @@ -63,8 +62,8 @@ import java.util.function.Consumer; /** - * StreamingQueryListener used to handle stopping all the streaming queries used internally in the - * DPL translation layer (pth_10). + * StreamingQueryListener used to handle stopping all the streaming queries used internally in the DPL translation layer + * (pth_10). */ public class DPLInternalStreamingQueryListener extends StreamingQueryListener implements Serializable { @@ -76,15 +75,12 @@ public class DPLInternalStreamingQueryListener extends StreamingQueryListener im private final Map _queries = new HashMap<>(); /** - * Used to send messages about the current streaming queries to the UI - * Map [K: String, V: Map[K:String, V:String]] + * Used to send messages about the current streaming queries to the UI Map [K: String, V: Map[K:String, V:String]] */ private Consumer>> _msgHandler; /** - * Contains information of all the queries - * Key = query id - * Value = map of query info key-value + * Contains information of all the queries Key = query id Value = map of query info key-value */ private final Map> _queryInfoMap = new HashMap<>(); @@ -94,6 +90,7 @@ public DPLInternalStreamingQueryListener() { /** * Initializes this listener in the specified sparkSession. 
+ * * @param sparkSession sparkSession to initialize the listener in */ public void init(SparkSession sparkSession) { @@ -109,6 +106,7 @@ public void init(SparkSession sparkSession) { /** * Add message handler to this listener + * * @param handler Consumer of type String */ public void registerHandler(Consumer>> handler) { @@ -126,6 +124,7 @@ public void unregisterHandler() { /** * Emit on query start + * * @param queryStartedEvent Spark calls on query start */ @Override @@ -142,6 +141,7 @@ public void onQueryStarted(QueryStartedEvent queryStartedEvent) { /** * Emit on query progress + * * @param queryProgressEvent Spark calls on query progress */ @Override @@ -152,9 +152,11 @@ public void onQueryProgress(QueryProgressEvent queryProgressEvent) { internalKeyValueMap.put("name", nameOfQuery); internalKeyValueMap.put("status", "processing"); - internalKeyValueMap.put("processedRowsPerSecond", String.valueOf(queryProgressEvent.progress().processedRowsPerSecond())); + internalKeyValueMap + .put("processedRowsPerSecond", String.valueOf(queryProgressEvent.progress().processedRowsPerSecond())); internalKeyValueMap.put("batchId", String.valueOf(queryProgressEvent.progress().batchId())); - internalKeyValueMap.put("inputRowsPerSecond", String.valueOf(queryProgressEvent.progress().inputRowsPerSecond())); + internalKeyValueMap + .put("inputRowsPerSecond", String.valueOf(queryProgressEvent.progress().inputRowsPerSecond())); internalKeyValueMap.put("id", String.valueOf(queryProgressEvent.progress().id())); internalKeyValueMap.put("runId", String.valueOf(queryProgressEvent.progress().runId())); internalKeyValueMap.put("timestamp", queryProgressEvent.progress().timestamp()); @@ -166,12 +168,20 @@ public void onQueryProgress(QueryProgressEvent queryProgressEvent) { internalKeyValueMap.put("sourceDescription", queryProgressEvent.progress().sources()[0].description()); internalKeyValueMap.put("sourceStartOffset", queryProgressEvent.progress().sources()[0].startOffset()); internalKeyValueMap.put("sourceEndOffset", queryProgressEvent.progress().sources()[0].endOffset()); - internalKeyValueMap.put("sourceInputRowsPerSecond", String.valueOf(queryProgressEvent.progress().sources()[0].inputRowsPerSecond())); - internalKeyValueMap.put("sourceProcessedRowsPerSecond", String.valueOf(queryProgressEvent.progress().sources()[0].processedRowsPerSecond())); - internalKeyValueMap.put("sourceNumInputRows", String.valueOf(queryProgressEvent.progress().sources()[0].numInputRows())); + internalKeyValueMap + .put( + "sourceInputRowsPerSecond", + String.valueOf(queryProgressEvent.progress().sources()[0].inputRowsPerSecond()) + ); + internalKeyValueMap + .put( + "sourceProcessedRowsPerSecond", + String.valueOf(queryProgressEvent.progress().sources()[0].processedRowsPerSecond()) + ); + internalKeyValueMap + .put("sourceNumInputRows", String.valueOf(queryProgressEvent.progress().sources()[0].numInputRows())); } - // completion checking if (this.isRegisteredQuery(nameOfQuery)) { DPLInternalStreamingQuery sq = this.getQuery(nameOfQuery); @@ -183,7 +193,11 @@ public void onQueryProgress(QueryProgressEvent queryProgressEvent) { internalKeyValueMap.put("status", "complete"); if (!wasRemoved) { - LOGGER.error("Removing the query <{}> from the internal DPLStreamingQuery listener was unsuccessful!", nameOfQuery); + LOGGER + .error( + "Removing the query <{}> from the internal DPLStreamingQuery listener was unsuccessful!", + nameOfQuery + ); } } } @@ -198,6 +212,7 @@ public void onQueryProgress(QueryProgressEvent queryProgressEvent) { /** * 
Emit on query termination + * * @param queryTerminatedEvent Spark calls on query termination */ @Override @@ -218,8 +233,9 @@ public void onQueryTerminated(QueryTerminatedEvent queryTerminatedEvent) { /** * Starts a streaming query with the given name and registers it to the listener + * * @param name Name of the query, must be unique - * @param dsw DataStreamWriter to start + * @param dsw DataStreamWriter to start * @return StreamingQuery; use its awaitTermination to block */ public StreamingQuery registerQuery(final String name, DataStreamWriter dsw) { @@ -229,7 +245,8 @@ public StreamingQuery registerQuery(final String name, DataStreamWriter dsw else { try { this._queries.put(name, new DPLInternalStreamingQuery(dsw.queryName(name).start())); - } catch (TimeoutException e) { + } + catch (TimeoutException e) { LOGGER.error("Exception occurred on query start <{}>", e.getMessage(), e); throw new RuntimeException("Could not register query: " + e.getMessage()); } @@ -240,6 +257,7 @@ public StreamingQuery registerQuery(final String name, DataStreamWriter dsw /** * Remove query from the listener + * * @param name queryName * @return was removal successful (bool) */ @@ -251,12 +269,14 @@ public boolean removeQuery(String name) { /** * Stop query + * * @param name Name of the query to stop */ public void stopQuery(String name) { try { this._queries.get(name).getQuery().stop(); - } catch (TimeoutException e) { + } + catch (TimeoutException e) { LOGGER.error("Exception occurred on query stop <{}>", e.getMessage(), e); throw new RuntimeException("Exception occurred on query stop: " + e.getMessage()); } @@ -264,6 +284,7 @@ public void stopQuery(String name) { /** * Does the internal map contain the specified query + * * @param name queryName * @return bool */ @@ -273,6 +294,7 @@ public boolean isRegisteredQuery(String name) { /** * Returns the internal DPLInternalStreamingQuery object based on queryName + * * @param name queryName * @return DPLInternalStreamingQuery object */ @@ -282,6 +304,7 @@ private DPLInternalStreamingQuery getQuery(String name) { /** * Send message to messageHandler if it was registered + * * @param s message string */ private void sendMessageEvent(Map> s) { @@ -296,6 +319,7 @@ private void sendMessageEvent(Map> s) { /** * Check if the stream has provided all the data or if it is still in progress + * * @return is the stream complete */ private boolean checkCompletion(DPLInternalStreamingQuery sq) { @@ -318,6 +342,7 @@ private boolean checkCompletion(DPLInternalStreamingQuery sq) { /** * check if archive stream is done + * * @param sq StreamingQuery object * @return done? */ @@ -326,8 +351,10 @@ private boolean isArchiveDone(StreamingQuery sq) { for (int i = 0; i < sq.lastProgress().sources().length; i++) { SourceProgress progress = sq.lastProgress().sources()[i]; - - if (progress.description() != null && !progress.description().startsWith(ArchiveMicroStreamReader.class.getName().concat("@"))) { + if ( + progress.description() != null + && !progress.description().startsWith(ArchiveMicroStreamReader.class.getName().concat("@")) + ) { // ignore others than archive continue; } @@ -347,6 +374,7 @@ private boolean isArchiveDone(StreamingQuery sq) { /** * check if memory stream is done + * * @param sq StreamingQuery object * @return done? 
*/ diff --git a/src/main/java/com/teragrep/pth10/ast/DPLParserCatalystContext.java b/src/main/java/com/teragrep/pth10/ast/DPLParserCatalystContext.java index a3bf66b..d730aa2 100644 --- a/src/main/java/com/teragrep/pth10/ast/DPLParserCatalystContext.java +++ b/src/main/java/com/teragrep/pth10/ast/DPLParserCatalystContext.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast; import com.teragrep.pth10.steps.AbstractStep; @@ -60,9 +59,11 @@ import java.util.function.Consumer; /** - * Encapsulates parameters for Catalyst code generator. In addition to that offers access to sparkcontext and incoming datasource + * Encapsulates parameters for Catalyst code generator. In addition to that offers access to sparkcontext and incoming + * datasource */ public class DPLParserCatalystContext { + private static final Logger LOGGER = LoggerFactory.getLogger(DPLParserCatalystContext.class); SparkSession sparkSession; @@ -109,7 +110,8 @@ public void setMessageLogger(Consumer messageLogger) { public void logMessageToUI(String msg) { if (this.messageLogger != null) { this.messageLogger.accept(msg); - } else { + } + else { LOGGER.warn("Tried to log message <{}> to UI, but messageLogger was not set!", msg); } } @@ -121,7 +123,8 @@ public void setMetricsLogger(Consumer> metricsLogger) { public void sendMetrics(Dataset metricsDs) { if (this.metricsLogger != null) { this.metricsLogger.accept(metricsDs); - } else { + } + else { LOGGER.warn("Tried to send metrics via MetricsLogger, but it was not set."); } } @@ -168,6 +171,7 @@ public long getDplMinimumEarliest() { // timechart span private Long timeChartSpanSeconds = null; + public void setTimeChartSpanSeconds(Long timeChartSpanSeconds) { this.timeChartSpanSeconds = timeChartSpanSeconds; } @@ -178,11 +182,11 @@ public Long getTimeChartSpanSeconds() { // DPLInternalStreamingQueryListener private final DPLInternalStreamingQueryListener internalStreamingQueryListener; + public DPLInternalStreamingQueryListener getInternalStreamingQueryListener() { return internalStreamingQueryListener; } - /** * Used to flush the remaining rows to from commands (e.g. 
sendemail and kafka save) */ @@ -211,6 +215,7 @@ public Integer getDplRecallSize() { /** * Sets the base url to be used for linking to the search results in sent emails + * * @param newValue like https://teragrep.com */ public void setBaseUrl(String newValue) { @@ -219,6 +224,7 @@ public void setBaseUrl(String newValue) { /** * Sets the paragraph id for the search results link + * * @param newValue like paragraph_1658138772905_773043366 */ public void setParagraphUrl(String newValue) { @@ -227,6 +233,7 @@ public void setParagraphUrl(String newValue) { /** * Sets the notebook id for the search results link + * * @param newValue like 2H7AVWKCQ */ public void setNotebookUrl(String newValue) { @@ -235,6 +242,7 @@ public void setNotebookUrl(String newValue) { /** * Get the notebook id + * * @return notebook id */ public String getNotebookUrl() { @@ -244,6 +252,7 @@ public String getNotebookUrl() { /** * Builds the full link to the search results to be inserted to the sent emails.
* Based on data from {@link #baseUrl}, {@link #notebookUrl} and {@link #paragraphUrl} + * * @return full URL */ public String getUrl() { @@ -260,6 +269,7 @@ public String getUrl() { /** * Get paragraph id + * * @return paragraph id */ public String getParagraphUrl() { @@ -333,6 +343,7 @@ public StepList getStepList() { /** * Initialize context with spark session + * * @param sparkSession active session */ public DPLParserCatalystContext(SparkSession sparkSession) { @@ -344,8 +355,9 @@ public DPLParserCatalystContext(SparkSession sparkSession) { /** * Initialize context with spark session and incoming dataset + * * @param sparkSession active session - * @param ds {@literal DataSet} + * @param ds {@literal DataSet} */ public DPLParserCatalystContext(SparkSession sparkSession, Dataset ds) { this.sparkSession = sparkSession; @@ -359,8 +371,9 @@ public DPLParserCatalystContext(SparkSession sparkSession, Dataset ds) { /** * Initialize context with spark session and config which is created in zeppelin + * * @param sparkSession active session - * @param config Zeppelin configuration object + * @param config Zeppelin configuration object */ public DPLParserCatalystContext(SparkSession sparkSession, Config config) { this.sparkSession = sparkSession; @@ -370,13 +383,14 @@ public DPLParserCatalystContext(SparkSession sparkSession, Config config) { this.internalStreamingQueryListener.init(this.sparkSession); if (config != null) { // set earliest to now-24h if in zeppelin env, otherwise it will be 1970-01-01 - this.dplDefaultEarliest = Instant.now().truncatedTo(ChronoUnit.DAYS).getEpochSecond() - 24*60*60L; + this.dplDefaultEarliest = Instant.now().truncatedTo(ChronoUnit.DAYS).getEpochSecond() - 24 * 60 * 60L; this.dplMinimumEarliest = this.dplDefaultEarliest; } } /** * Get session + * * @return active spark session */ public SparkSession getSparkSession() { @@ -385,6 +399,7 @@ public SparkSession getSparkSession() { /** * Get current dataset + * * @return {@literal Dataset} */ public Dataset getDs() { @@ -393,6 +408,7 @@ public Dataset getDs() { /** * Set active or initial dataset. 
Used with tests + * * @param inDs {@literal Dataset} incoming dataset */ public void setDs(Dataset inDs) { @@ -401,6 +417,7 @@ public void setDs(Dataset inDs) { /** * Get current zeppelin config object + * * @return Zepplein config */ public Config getConfig() { @@ -444,11 +461,12 @@ public void setTestingMode(boolean testingMode) { } @Override - public DPLParserCatalystContext clone() { + public DPLParserCatalystContext clone() { DPLParserCatalystContext ctx; try { - ctx = (DPLParserCatalystContext)super.clone(); - } catch (CloneNotSupportedException e) { + ctx = (DPLParserCatalystContext) super.clone(); + } + catch (CloneNotSupportedException e) { LOGGER.debug("Clone not supported, create object copy"); ctx = new DPLParserCatalystContext(this.sparkSession); ctx.setParserConfig(parserConfig); diff --git a/src/main/java/com/teragrep/pth10/ast/DPLParserCatalystVisitor.java b/src/main/java/com/teragrep/pth10/ast/DPLParserCatalystVisitor.java index 6ed81c3..1ca8bfb 100644 --- a/src/main/java/com/teragrep/pth10/ast/DPLParserCatalystVisitor.java +++ b/src/main/java/com/teragrep/pth10/ast/DPLParserCatalystVisitor.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,10 +43,8 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast; -import com.teragrep.functions.dpf_02.BatchCollect; import com.teragrep.pth10.ast.bo.*; import com.teragrep.pth10.ast.bo.Token.Type; import com.teragrep.pth10.ast.commands.logicalstatement.LogicalStatementCatalyst; @@ -64,8 +62,6 @@ import org.apache.spark.sql.Column; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.streaming.DataStreamWriter; -import org.apache.spark.sql.streaming.StreamingQueryException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; @@ -79,6 +75,7 @@ * Visitor used for Catalyst emit mode (main emit mode, XML emit mode only used for archive query) */ public class DPLParserCatalystVisitor extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(DPLParserCatalystVisitor.class); Node logicalPart = null; @@ -90,7 +87,6 @@ public class DPLParserCatalystVisitor extends DPLParserBaseVisitor { private final DPLParserCatalystContext catCtx; - // hdfs path - used for join command's subsearch save private String hdfsPath = null; @@ -117,8 +113,9 @@ public StepList getStepList() { } /** - * returns "trace buffer", which is an empty list for compatibility reasons. - * getTracebuffer is used in some older tests, which should be removed + * returns "trace buffer", which is an empty list for compatibility reasons. 
getTracebuffer is used in some older + * tests, which should be removed + * * @return empty list */ @Deprecated @@ -133,13 +130,14 @@ public Consumer> getConsumer() { /** * Sets the consumer to handle the results of each of the batches + * * @param consumer Consumer with type Dataset to be implemented in pth_07 */ public void setConsumer(Consumer> consumer) { this.stepList.setBatchHandler(consumer); } - public void setMessageHandler(Consumer>> messageHandler) { + public void setMessageHandler(Consumer>> messageHandler) { this.messageHandler = messageHandler; // register messageHandler to DPLInternalStreamingQueryListener if (this.catCtx != null && this.messageHandler != null) { @@ -152,35 +150,42 @@ public void setMessageHandler(Consumer>> messageH /** * Sets the maximum results for batchCollect + * * @param val int value */ public void setDPLRecallSize(Integer val) { - this.getCatalystContext().setDplRecallSize(val); + this.getCatalystContext().setDplRecallSize(val); } /** * Gets the dpl recall size (max results from batchCollect) + * * @return max results as int */ public Integer getDPLRecallSize() { - return this.getCatalystContext().getDplRecallSize(); + return this.getCatalystContext().getDplRecallSize(); } /** * HDFS path used for join subsearch save + * * @param path path as string */ public void setHdfsPath(String path) { - this.hdfsPath = path; + this.hdfsPath = path; } /** * HDFS path used for join/eventstats/subsearch save
* Generate a random path if none was set via setHdfsPath() + * * @return path as string */ public String getHdfsPath() { - if (this.hdfsPath == null && this.catCtx != null && this.catCtx.getSparkSession() != null && this.catCtx.getParagraphUrl() != null) { + if ( + this.hdfsPath == null && this.catCtx != null && this.catCtx.getSparkSession() != null + && this.catCtx.getParagraphUrl() != null + ) { final String appId = this.catCtx.getSparkSession().sparkContext().applicationId(); final String paragraphId = this.catCtx.getParagraphUrl(); final String path = String.format("/tmp/%s/%s/%s/", appId, paragraphId, UUID.randomUUID()); @@ -200,8 +205,9 @@ else if (this.hdfsPath == null) { } /** - * Sets the backup mmdb database path used by iplocation command - * Only used, if the zeppelin config item is not found. + * Sets the backup mmdb database path used by iplocation command Only used, if the zeppelin config item is not + * found. + * * @param iplocationMmdbPath new mmdb file path as string */ public void setIplocationMmdbPath(String iplocationMmdbPath) { @@ -209,16 +215,16 @@ public void setIplocationMmdbPath(String iplocationMmdbPath) { } /** - * Gets the backup mmdb database path used by iplocation command - * Only used, if the zeppelin config item is not found. + * Gets the backup mmdb database path used by iplocation command Only used, if the zeppelin config item is not + * found. + * * @return mmdb file path as string */ public String getIplocationMmdbPath() { return iplocationMmdbPath; } - public boolean getAggregatesUsed() - { + public boolean getAggregatesUsed() { return this.getStepList().getAggregateCount() > 0; } @@ -226,7 +232,7 @@ public boolean getAggregatesUsed() public String getLogicalPart() { String rv = null; if (logicalPart != null) { - ColumnNode cn = (ColumnNode)logicalPart; + ColumnNode cn = (ColumnNode) logicalPart; LOGGER.debug("\ngetLogicalPart incoming=<{}>", logicalPart); rv = cn.asExpression().sql(); } @@ -245,6 +251,7 @@ public Column getLogicalPartAsColumn() { /** * Get current DPLCatalystContext containing for instance audit information + * * @return DPLCatalystContext */ public DPLParserCatalystContext getCatalystContext() { @@ -269,7 +276,8 @@ public Node visitRoot(DPLParser.RootContext ctx) { if (ctx.searchTransformationRoot() != null) { LOGGER.info("visitRoot Handle logical part: <{}>", ctx.getChild(0).getText()); logicalPart = visitSearchTransformationRoot(ctx.searchTransformationRoot()); - } else { + } + else { // no logical part, e.g. makeresults or similar command in use without main search this.getStepList().add(new EmptyDataframeStep()); } @@ -290,11 +298,16 @@ public Node visitSearchTransformationRoot(DPLParser.SearchTransformationRootCont // Check for index= / index!= / index IN without right side if (ctx.getChildCount() == 1 && ctx.getChild(0) instanceof TerminalNode) { TerminalNode term = (TerminalNode) ctx.getChild(0); - if (term.getSymbol().getType() == DPLLexer.INDEX_EQ || term.getSymbol().getType() == DPLLexer.INDEX_SPACE - || term.getSymbol().getType() == DPLLexer.INDEX_NEG || term.getSymbol().getType() == DPLLexer.INDEX_SPACE_NEG - || term.getSymbol().getType() == DPLLexer.INDEX_IN) { - throw new RuntimeException("The right side of the search qualifier was empty! 
Check that the index has" + - " a valid value, like 'index = cinnamon'."); + if ( + term.getSymbol().getType() == DPLLexer.INDEX_EQ || term.getSymbol().getType() == DPLLexer.INDEX_SPACE + || term.getSymbol().getType() == DPLLexer.INDEX_NEG + || term.getSymbol().getType() == DPLLexer.INDEX_SPACE_NEG + || term.getSymbol().getType() == DPLLexer.INDEX_IN + ) { + throw new RuntimeException( + "The right side of the search qualifier was empty! Check that the index has" + + " a valid value, like 'index = cinnamon'." + ); } } @@ -324,14 +337,15 @@ public Node visitSearchTransformationRoot(DPLParser.SearchTransformationRootCont return new NullNode(); } - /** Used to visit subsearches. Builds a StepList for subsearch and returns SubsearchStep. + /** + * Used to visit subsearches. Builds a StepList for subsearch and returns SubsearchStep. * * @param ctx SubsearchTransformStatementContext * @return StepNode that has subsearchStep inside */ @Override public Node visitSubsearchStatement(DPLParser.SubsearchStatementContext ctx) { - DPLParser.SearchTransformationContext searchCtx = null; + DPLParser.SearchTransformationContext searchCtx = null; if (ctx.transformStatement() != null) { searchCtx = ctx.transformStatement().searchTransformation(); } @@ -342,14 +356,16 @@ public Node visitSubsearchStatement(DPLParser.SubsearchStatementContext ctx) { // data from other indices compared to main query Document xmlDoc; try { - xmlDoc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument(); - } catch (ParserConfigurationException e) { + xmlDoc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument(); + } + catch (ParserConfigurationException e) { throw new RuntimeException(e); } LogicalStatementXML logicalXml = new LogicalStatementXML(catCtx, xmlDoc); AbstractStep xmlStep = logicalXml.visitLogicalStatementXML(searchCtx.searchTransformationRoot()); - AbstractStep catalystStep = this.logicalCatalyst.visitLogicalStatementCatalyst(searchCtx.searchTransformationRoot()); + AbstractStep catalystStep = this.logicalCatalyst + .visitLogicalStatementCatalyst(searchCtx.searchTransformationRoot()); this.stepList.add(xmlStep); this.stepList.add(catalystStep); @@ -360,7 +376,8 @@ public Node visitSubsearchStatement(DPLParser.SubsearchStatementContext ctx) { transformStatement.visit(ctx.transformStatement().transformStatement()); } - } else { // no main search, check first transformStatement + } + else { // no main search, check first transformStatement if (ctx.transformStatement() != null) { transformStatement = new TransformStatement(catCtx, this); // Adding transformation steps to stepList is done in TransformStatement @@ -374,10 +391,9 @@ public Node visitSubsearchStatement(DPLParser.SubsearchStatementContext ctx) { } /** - * logicalStatement : macroStatement | subsearchStatement | sublogicalStatement - * | timeStatement | searchQualifier | Not logicalStatement | indexStatement | - * comparisonStatement | logicalStatement Or logicalStatement | logicalStatement - * And? logicalStatement ; + * logicalStatement : macroStatement | subsearchStatement | sublogicalStatement | timeStatement | searchQualifier | + * Not logicalStatement | indexStatement | comparisonStatement | logicalStatement Or logicalStatement | + * logicalStatement And? logicalStatement ; */ @Override public Node visitLogicalStatement(DPLParser.LogicalStatementContext ctx) { @@ -386,9 +402,9 @@ public Node visitLogicalStatement(DPLParser.LogicalStatementContext ctx) { /** * {@inheritDoc} - * - *

<p>The default implementation returns the result of calling - * {@link #visitChildren} on {@code ctx}.</p> + * <p> + * The default implementation returns the result of calling {@link #visitChildren} on {@code ctx}. + * </p>
*/ @Override public Node visitComparisonStatement(DPLParser.ComparisonStatementContext ctx) { @@ -397,8 +413,7 @@ public Node visitComparisonStatement(DPLParser.ComparisonStatementContext ctx) { } /** - * Time format handling - * timeStatement : timeFormatQualifier? timeQualifier ; + * Time format handling timeStatement : timeFormatQualifier? timeQualifier ; */ @Override public Node visitTimeStatement(DPLParser.TimeStatementContext ctx) { diff --git a/src/main/java/com/teragrep/pth10/ast/DPLParserConfig.java b/src/main/java/com/teragrep/pth10/ast/DPLParserConfig.java index e1b6a43..0d77751 100644 --- a/src/main/java/com/teragrep/pth10/ast/DPLParserConfig.java +++ b/src/main/java/com/teragrep/pth10/ast/DPLParserConfig.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast; import com.teragrep.pth10.ast.time.RelativeTimeParser; @@ -58,10 +57,11 @@ import static java.lang.Math.abs; /** - * Configuration for parser/UI data transfer. For instance latest/earliest values used for default span/time window calculation - * and map which can contain additional named values. + * Configuration for parser/UI data transfer. For instance latest/earliest values used for default span/time window + * calculation and map which can contain additional named values. */ public class DPLParserConfig { + private static final Logger LOGGER = LoggerFactory.getLogger(DPLParserConfig.class); private Map config = new LinkedHashMap<>(); @@ -70,40 +70,41 @@ public class DPLParserConfig { /** * Get named value from config map + * * @param key Name string * @return value */ - public Object get(String key) - { + public Object get(String key) { return config.get(key); } /** * Put named value object into the map - * @param key name string + * + * @param key name string * @param value as object. String/int/... */ - public void put(String key, Object value) - { + public void put(String key, Object value) { config.put(key, value); } /** - * Get earliest flag which is used when calculating window ranges for different spans. + * Get earliest flag which is used when calculating window ranges for different spans. + * * @return earliest as string value */ - public String getEarliest() - { + public String getEarliest() { return (String) config.get("earliest"); } /** - * Set earliest flag with given string, If flag is relative, calculate it relative to current now-instance. Store + * Set earliest flag with given string, If flag is relative, calculate it relative to current now-instance. 
Store * also that calculated value as epoch + * * @param earliest string value like -1h or actual timestamp */ public void setEarliest(String earliest) { - long earliestEpoch =0; + long earliestEpoch = 0; Timestamp now = new Timestamp(System.currentTimeMillis()); RelativeTimeParser rtParser = new RelativeTimeParser(); @@ -111,7 +112,8 @@ public void setEarliest(String earliest) { try { RelativeTimestamp rtTimestamp = rtParser.parse(earliest); // can throw error if not relative timestamp earliestEpoch = rtTimestamp.calculate(now); - } catch (NumberFormatException ne) { + } + catch (NumberFormatException ne) { // absolute time earliestEpoch = new DefaultTimeFormat().getEpoch(earliest); } @@ -121,7 +123,8 @@ public void setEarliest(String earliest) { } /** - * Get latest flag which is used when calculating window ranges for different spans. + * Get latest flag which is used when calculating window ranges for different spans. + * * @return latest as string value */ public String getLatest() { @@ -129,8 +132,9 @@ public String getLatest() { } /** - * Set latest flag with given string, If flag is relative, calculate it relative to current now-instance. Store - * also that calculated value as epoch + * Set latest flag with given string, If flag is relative, calculate it relative to current now-instance. Store also + * that calculated value as epoch + * * @param latest string value like -1h or actual timestamp */ public void setLatest(String latest) { @@ -142,7 +146,8 @@ public void setLatest(String latest) { try { RelativeTimestamp rtTimestamp = rtParser.parse(latest); // can throw exception if not relative timestamp latestEpoch = rtTimestamp.calculate(now); - } catch (NumberFormatException ne) { + } + catch (NumberFormatException ne) { // absolute time latestEpoch = new DefaultTimeFormat().getEpoch(latest); } @@ -152,7 +157,8 @@ public void setLatest(String latest) { } /** - * Use config map and calculate default time range according to it. + * Use config map and calculate default time range according to it. 
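For orientation, a minimal usage sketch of how the earliest/latest setters and getTimeRange are meant to interact (illustrative only, not part of this patch; the helper class name and the sample values "-1h" and "04/20/2024:12:00:00" are invented, and the DPLParserConfig instance is assumed to come from the surrounding parser/UI setup):

import com.teragrep.pth10.ast.DPLParserConfig;
import com.teragrep.pth10.ast.TimeRange;

public final class TimeRangeSketch {

    // 'parserConfig' is assumed to be provided by the surrounding translator setup
    public static TimeRange resolveBucket(final DPLParserConfig parserConfig) {
        // relative flag: resolved against the current time, the calculated
        // epoch is stored in the config map under "earliestEpoch"
        parserConfig.setEarliest("-1h");
        // absolute flag: parsed via DefaultTimeFormat (MM/dd/yyyy:HH:mm:ss)
        parserConfig.setLatest("04/20/2024:12:00:00");
        // with both flags set, the range is derived from latestEpoch - earliestEpoch
        // and mapped to one of the TimeRange constants
        return parserConfig.getTimeRange();
    }
}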
+ * * @return enum range values 10s,...,1M */ public TimeRange getTimeRange() { @@ -160,27 +166,33 @@ public TimeRange getTimeRange() { long r = 0; // Earliest set, latest not if (config.get("earliest") != null && config.get("latest") == null) { - r = System.currentTimeMillis() - (long)config.get("earliestEpoch"); - } else if (config.get("latest") != null && config.get("earliest") == null) { - r = (long)config.get("latestEpoch"); - } else if (config.get("earliest") != null && config.get("latest") != null) { + r = System.currentTimeMillis() - (long) config.get("earliestEpoch"); + } + else if (config.get("latest") != null && config.get("earliest") == null) { + r = (long) config.get("latestEpoch"); + } + else if (config.get("earliest") != null && config.get("latest") != null) { // Both set // Calculate time range according to latest-earliest - LOGGER.info("config=<[{}]>",config); - r = (long)config.get("latestEpoch") - (long)config.get("earliestEpoch"); + LOGGER.info("config=<[{}]>", config); + r = (long) config.get("latestEpoch") - (long) config.get("earliestEpoch"); } - if(r<0) - r=abs(r); - LOGGER.info("Calculated range=<{}>",r); - if (r <= 15 * 60 ) { + if (r < 0) + r = abs(r); + LOGGER.info("Calculated range=<{}>", r); + if (r <= 15 * 60) { rv = TimeRange.TEN_SECONDS; - } else if (r <= 60 * 60 ) { + } + else if (r <= 60 * 60) { rv = TimeRange.ONE_MINUTE; - } else if (r <= 4 * 60 * 60 ) { + } + else if (r <= 4 * 60 * 60) { rv = TimeRange.FIVE_MINUTES; - } else if (r <= 24 * 60 * 60 ) { + } + else if (r <= 24 * 60 * 60) { rv = TimeRange.THIRTY_MINUTES; - } else if (r > 30 * 24 * 60 * 60 ) { + } + else if (r > 30 * 24 * 60 * 60) { // Default max value is 1 day rv = TimeRange.ONE_DAY; } diff --git a/src/main/java/com/teragrep/pth10/ast/DPLTimeFormat.java b/src/main/java/com/teragrep/pth10/ast/DPLTimeFormat.java index 7c5bc20..c733d97 100644 --- a/src/main/java/com/teragrep/pth10/ast/DPLTimeFormat.java +++ b/src/main/java/com/teragrep/pth10/ast/DPLTimeFormat.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,15 +43,13 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast; import java.text.ParseException; import java.text.SimpleDateFormat; /** - * For using the DPL custom timeformat like Java's SimpleDateFormat. - * Get a Date object with parse -function for example. + * For using the DPL custom timeformat like Java's SimpleDateFormat. Get a Date object with parse -function for example. */ public final class DPLTimeFormat { @@ -62,8 +60,9 @@ public DPLTimeFormat(String format) { } /** - * Create a SimpleDateFormat object from the given DPL specific timeformat. - * Allows all sorts of parsing and tampering with the time. + * Create a SimpleDateFormat object from the given DPL specific timeformat. 
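As a rough illustration of the DPL-to-Java pattern conversion this class performs (a sketch only, not part of the patch; the class name, sample pattern, and timestamp are invented, and the epoch unit is whatever getEpoch returns):

import com.teragrep.pth10.ast.DPLTimeFormat;
import java.text.ParseException;

public final class DplTimeFormatSketch {

    public static void main(final String[] args) throws ParseException {
        // "%Y-%m-%d %H:%M:%S" should map to the Java pattern "y-MM-dd HH:mm:ss"
        // according to the %-code replacements listed further down in this hunk
        final DPLTimeFormat dplTimeFormat = new DPLTimeFormat("%Y-%m-%d %H:%M:%S");
        final long epoch = dplTimeFormat.getEpoch("2024-04-20 12:00:00");
        System.out.println(epoch);
    }
}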
Allows all sorts of parsing and + * tampering with the time. + * * @return SimpleDateFormat created from the DPLTimeFormat */ public SimpleDateFormat createSimpleDateFormat() { @@ -71,9 +70,10 @@ public SimpleDateFormat createSimpleDateFormat() { } /** - * Parses the time string and converts it to an unic epoch long. - * Uses the system timezone as default if timezone is not specified in the pattern (given in the constructor). - * For setting a timezone later on (or any other operation) you will have to use createSimpleDateFormat function. + * Parses the time string and converts it to an unic epoch long. Uses the system timezone as default if timezone is + * not specified in the pattern (given in the constructor). For setting a timezone later on (or any other operation) + * you will have to use createSimpleDateFormat function. + * * @param dplTime Time represented with the pattern * @return Unix Epoch * @throws ParseException when dplTime doesn't have the correct format @@ -86,31 +86,31 @@ public long getEpoch(String dplTime) throws ParseException { private String convertDplTimeFormatToJava(String dplTf) { dplTf = new UnquotedText(new TextString(dplTf)).read(); return dplTf - // Date - .replaceAll("%F", "y-MM-dd") // ISO 8601 %Y-%m-%d - .replaceAll("%y", "yy") // year without century (00-99) - .replaceAll("%Y", "y") // full year - .replaceAll("%m", "MM") // month 1-12 - .replaceAll("%d", "dd") // day 1-31 - .replaceAll("%b", "MMM") // abbrv. month name - .replaceAll("%B", "MMMM") // full month name - .replaceAll("%A", "EE") // full weekday name, e.g. "sunday" - .replaceAll("%a", "E") // abbrv. weekday name, e.g. "Sun" - .replaceAll("%j", "D") // day of year, 001-366 - .replaceAll("%w", "F") // weekday as decimal 0=sun 6=sat - // Time - .replaceAll("%H", "HH") // hour 0-23 - .replaceAll("%k", "H") // hour without leading zeroes - .replaceAll("%M", "mm") // minute 0-59 - .replaceAll("%S", "ss") // second 0-59 - .replaceAll("%I", "hh") // hour 1-12 - .replaceAll("%p", "a") // am/pm - .replaceAll("%T", "HH:mm:ss") // hour:min:sec - .replaceAll("%f", "SSS") // microsecs - // Time zone - .replaceAll("%Z", "zz") // timezone abbreviation - .replaceAll("%z", "X") // timezone offset +00:00 - // Other - .replaceAll("%%", "%"); // percent sign + // Date + .replaceAll("%F", "y-MM-dd") // ISO 8601 %Y-%m-%d + .replaceAll("%y", "yy") // year without century (00-99) + .replaceAll("%Y", "y") // full year + .replaceAll("%m", "MM") // month 1-12 + .replaceAll("%d", "dd") // day 1-31 + .replaceAll("%b", "MMM") // abbrv. month name + .replaceAll("%B", "MMMM") // full month name + .replaceAll("%A", "EE") // full weekday name, e.g. "sunday" + .replaceAll("%a", "E") // abbrv. weekday name, e.g. 
"Sun" + .replaceAll("%j", "D") // day of year, 001-366 + .replaceAll("%w", "F") // weekday as decimal 0=sun 6=sat + // Time + .replaceAll("%H", "HH") // hour 0-23 + .replaceAll("%k", "H") // hour without leading zeroes + .replaceAll("%M", "mm") // minute 0-59 + .replaceAll("%S", "ss") // second 0-59 + .replaceAll("%I", "hh") // hour 1-12 + .replaceAll("%p", "a") // am/pm + .replaceAll("%T", "HH:mm:ss") // hour:min:sec + .replaceAll("%f", "SSS") // microsecs + // Time zone + .replaceAll("%Z", "zz") // timezone abbreviation + .replaceAll("%z", "X") // timezone offset +00:00 + // Other + .replaceAll("%%", "%"); // percent sign } } diff --git a/src/main/java/com/teragrep/pth10/ast/DefaultTimeFormat.java b/src/main/java/com/teragrep/pth10/ast/DefaultTimeFormat.java index 99144f4..321eb44 100644 --- a/src/main/java/com/teragrep/pth10/ast/DefaultTimeFormat.java +++ b/src/main/java/com/teragrep/pth10/ast/DefaultTimeFormat.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast; import java.text.ParseException; @@ -51,16 +50,15 @@ import java.util.Date; /** - * Parser for the three default timeformats that can be used: - * 1. MM/dd/yyyy:HH:mm:ss - * 2. ISO 8601 with timezone offset, e.g. 2011-12-03T10:15:30+01:00 - * 3. ISO 8601 without offset, e.g. 2011-12-03T10:15:30 - * When timezone is not specified, uses the system default + * Parser for the three default timeformats that can be used: 1. MM/dd/yyyy:HH:mm:ss 2. ISO 8601 with timezone offset, + * e.g. 2011-12-03T10:15:30+01:00 3. ISO 8601 without offset, e.g. 2011-12-03T10:15:30 When timezone is not specified, + * uses the system default */ public class DefaultTimeFormat { /** * Calculate the epoch from given string. + * * @param time The human-readable time * @return epoch as long */ @@ -70,6 +68,7 @@ public long getEpoch(String time) { /** * Parses the given human-readable time to a Date object. + * * @param time The human-readable time * @return Date parsed from the given string */ @@ -84,21 +83,25 @@ public Date parse(String time) { // Use default format (MM/dd/yyyy:HH:mm:ss) // Use system default timezone date = this.parseDate(time, "MM/dd/yyyy:HH:mm:ss"); - } else if (attempt == 1) { + } + else if (attempt == 1) { // On first fail, try ISO 8601 with timezone offset, e.g. '2011-12-03T10:15:30+01:00' date = this.parseDate(time, "yyyy-MM-dd'T'HH:mm:ssXXX"); - } else { + } + else { // On second fail, try ISO 8601 without offset, e.g. 
'2011-12-03T10:15:30' // Use system default timezone date = this.parseDate(time, "yyyy-MM-dd'T'HH:mm:ss"); } break; - } catch (ParseException e) { + } + catch (ParseException e) { if (attempt > 1) { throw new RuntimeException("TimeQualifier conversion error: <" + time + "> can't be parsed."); } - } finally { + } + finally { attempt++; } } diff --git a/src/main/java/com/teragrep/pth10/ast/MapTypeColumn.java b/src/main/java/com/teragrep/pth10/ast/MapTypeColumn.java index f82f82b..d52db92 100644 --- a/src/main/java/com/teragrep/pth10/ast/MapTypeColumn.java +++ b/src/main/java/com/teragrep/pth10/ast/MapTypeColumn.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast; import org.apache.spark.sql.Dataset; @@ -74,13 +73,15 @@ public MapTypeColumn(Dataset dataset, String columnName, DPLParserCatalystC /** * Extracts the keys from the Map of the column. + * * @return Keys as a Set of Strings */ public Set getKeys() throws StreamingQueryException { Set keys; if (dataset.isStreaming()) { keys = this.getKeysParallel(); - } else { + } + else { keys = this.getKeysSequential(); } return keys; @@ -90,15 +91,10 @@ public Set getKeys() throws StreamingQueryException { private Set getKeysSequential() { final Set keys = new HashSet<>(); - dataset.select( - functions.explode( - functions.map_keys( - functions.col(this.columnName) - ) - ) - ) - .collectAsList() - .forEach(r -> keys.add(r.getString(0))); + dataset + .select(functions.explode(functions.map_keys(functions.col(this.columnName)))) + .collectAsList() + .forEach(r -> keys.add(r.getString(0))); return keys; } @@ -111,19 +107,14 @@ private Set getKeysParallel() throws StreamingQueryException { if (dataset.isStreaming()) { // parallel mode final String id = UUID.randomUUID().toString(); final String name = "keys_" + id; - DataStreamWriter writer = dataset.select( - functions.explode( - functions.map_keys( - functions.col(this.columnName) - ) - ) - ).writeStream().foreachBatch((batchDs, batchId) -> { - // Get all the map's keys - // e.g. key->value; key2->value2 ==> key, key2 - batchDs - .collectAsList() - .forEach(r -> keys.add(r.getString(0))); - }); + DataStreamWriter writer = dataset + .select(functions.explode(functions.map_keys(functions.col(this.columnName)))) + .writeStream() + .foreachBatch((batchDs, batchId) -> { + // Get all the map's keys + // e.g. 
key->value; key2->value2 ==> key, key2 + batchDs.collectAsList().forEach(r -> keys.add(r.getString(0))); + }); StreamingQuery sq = this.catCtx.getInternalStreamingQueryListener().registerQuery(name, writer); sq.awaitTermination(); diff --git a/src/main/java/com/teragrep/pth10/ast/NullValue.java b/src/main/java/com/teragrep/pth10/ast/NullValue.java index 33cd15c..72f53b8 100644 --- a/src/main/java/com/teragrep/pth10/ast/NullValue.java +++ b/src/main/java/com/teragrep/pth10/ast/NullValue.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -48,32 +48,36 @@ import java.io.Serializable; /** - * Object, that is used to provide the value to be used as the - * null value across the project. + * Object, that is used to provide the value to be used as the null value across the project. */ public class NullValue implements Serializable { + private static final long serialVersionUID = 1L; /** - *
<p>Defines which type of null to use as the null value.</p>
- *
+ * <p>
+ * Defines which type of null to use as the null value.
+ * </p>
- * <ul>
- *   <li>{@link Type#DEFAULT_NULL}: Java null value</li>
- *   <li>{@link Type#EMPTY_STRING}: "" string</li>
- *   <li>{@link Type#NULL_AS_STRING}: "null" string</li>
- * </ul>
+ * <ul>
+ * <li>{@link Type#DEFAULT_NULL}: Java null value</li>
+ * <li>{@link Type#EMPTY_STRING}: "" string</li>
+ * <li>{@link Type#NULL_AS_STRING}: "null" string</li>
+ * </ul>
*/ public enum Type { - DEFAULT_NULL, - EMPTY_STRING, - NULL_AS_STRING + DEFAULT_NULL, EMPTY_STRING, NULL_AS_STRING } + private final Type type; + public NullValue() { this.type = Type.DEFAULT_NULL; } + public NullValue(Type type) { this.type = type; } + public String value() { switch (type) { case DEFAULT_NULL: diff --git a/src/main/java/com/teragrep/pth10/ast/NumericText.java b/src/main/java/com/teragrep/pth10/ast/NumericText.java index a4e9e04..1660e2c 100644 --- a/src/main/java/com/teragrep/pth10/ast/NumericText.java +++ b/src/main/java/com/teragrep/pth10/ast/NumericText.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -48,24 +48,28 @@ import java.util.regex.Pattern; public class NumericText implements Text { + private final Pattern numericPattern = Pattern.compile("-?\\d+(\\.\\d+)?"); private final Text origin; public NumericText(Text text) { this.origin = text; } + @Override public String read() { final String s = origin.read(); if (isNumeric(s)) { return s; - } else { + } + else { throw new RuntimeException("Non-numeric text was provided!"); } } /** * Checks if given string is numeric + * * @param s any string * @return was the string numeric? */ diff --git a/src/main/java/com/teragrep/pth10/ast/PrettyTree.java b/src/main/java/com/teragrep/pth10/ast/PrettyTree.java index 5cd1b7c..1740744 100644 --- a/src/main/java/com/teragrep/pth10/ast/PrettyTree.java +++ b/src/main/java/com/teragrep/pth10/ast/PrettyTree.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast; import com.teragrep.pth_03.shaded.org.antlr.v4.runtime.misc.Utils; @@ -71,8 +70,9 @@ public PrettyTree(final Tree t, final List ruleNames) { } /** - * Pretty print out a whole tree. getNodeText is used on the node payloads to get the text - * for the nodes. (Derived from Trees.toStringTree(....)) + * Pretty print out a whole tree. getNodeText is used on the node payloads to get the text for the nodes. 
(Derived + * from Trees.toStringTree(....)) + * * @return pretty tree as string */ public String getTree() { @@ -81,7 +81,8 @@ public String getTree() { } private String process(final Tree t, final List ruleNames) { - if (t.getChildCount() == 0) return Utils.escapeWhitespace(Trees.getNodeText(t, ruleNames), false); + if (t.getChildCount() == 0) + return Utils.escapeWhitespace(Trees.getNodeText(t, ruleNames), false); StringBuilder sb = new StringBuilder(); sb.append(lead(level)); level++; @@ -105,4 +106,4 @@ private String lead(int level) { } return sb.toString(); } -} \ No newline at end of file +} diff --git a/src/main/java/com/teragrep/pth10/ast/StepList.java b/src/main/java/com/teragrep/pth10/ast/StepList.java index 319b002..846fd32 100644 --- a/src/main/java/com/teragrep/pth10/ast/StepList.java +++ b/src/main/java/com/teragrep/pth10/ast/StepList.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -68,6 +68,7 @@ import java.util.function.Consumer; public class StepList implements VoidFunction2, Long> { + private static final Logger LOGGER = LoggerFactory.getLogger(StepList.class); private final List list; private int breakpoint = -1; @@ -76,7 +77,7 @@ public class StepList implements VoidFunction2, Long> { private boolean ignoreDefaultSorting = false; private OutputMode outputMode = OutputMode.Append(); - private Consumer> batchHandler = null; // for UI + private Consumer> batchHandler = null; // for UI private BatchCollect batchCollect; // standard batchCollect, used before sending batch event private BatchCollect sequentialModeBatchCollect; // used if in append mode and in sequential, to allow aggregates in sequential mode private DPLParserCatalystVisitor catVisitor; @@ -107,6 +108,7 @@ public StepList(DPLParserCatalystVisitor catVisitor) { /** * Add the specified step to the StepList + * * @param step step to add * @return if adding was a success */ @@ -115,9 +117,10 @@ public boolean add(AbstractStep step) { } /** - * Returns a map containing the field names and their values as toString() for those values provides. - * If the given index is invalid, returns null. If the value of a field cannot be accessed, - * returns a ??? value in the map instead. + * Returns a map containing the field names and their values as toString() for those values provides. If the given + * index is invalid, returns null. If the value of a field cannot be accessed, returns a ??? value in the map + * instead. 
+ * * @param i index between 0 and size-1 of the internal list * @return mapping of field-value */ @@ -129,18 +132,21 @@ public Map getParamsOf(int i) { f.setAccessible(true); try { rv.put(f.getName(), f.get(this.list.get(i)).toString()); - } catch (IllegalAccessException e) { + } + catch (IllegalAccessException e) { rv.put(f.getName(), "???"); } } return rv; - } else { + } + else { return null; } } /** * returns the count of aggregates currently processed + * * @return the count */ public int getAggregateCount() { @@ -149,6 +155,7 @@ public int getAggregateCount() { /** * Execute the steps included in the list + * * @return DataStreamWriter which can be used to start the query */ public DataStreamWriter execute() throws StreamingQueryException { @@ -177,16 +184,12 @@ private DataStreamWriter executeFromStep(int fromStepIndex, Dataset ds // Switch to sequential; aka run the step inside forEachBatch LOGGER.debug("breakpoint encountered at index <{}>", i); - return ds - .writeStream() - .outputMode(this.outputMode) - .foreachBatch(this); + return ds.writeStream().outputMode(this.outputMode).foreachBatch(this); } ds = step.get(ds); } - return ds.writeStream().outputMode(this.outputMode) - .foreachBatch(this); + return ds.writeStream().outputMode(this.outputMode).foreachBatch(this); } private Dataset executeInBatch(Dataset ds) throws StreamingQueryException { @@ -216,17 +219,16 @@ private void analyze() { step.setAggregatesUsedBefore(aggregateCount > 0); - if (step.hasProperty(AbstractStep.CommandProperty.USES_INTERNAL_BATCHCOLLECT)){ + if (step.hasProperty(AbstractStep.CommandProperty.USES_INTERNAL_BATCHCOLLECT)) { LOGGER.info("[Analyze] Step uses internal batch collect: <{}>", step); this.useInternalBatchCollect = true; this.batchCollect = null; } if (step.hasProperty(AbstractStep.CommandProperty.IGNORE_DEFAULT_SORTING)) { - LOGGER.info("[Analyze] Ignore default sorting: <{}>",step); + LOGGER.info("[Analyze] Ignore default sorting: <{}>", step); this.ignoreDefaultSorting = true; - this.batchCollect = new BatchCollect(null, - catVisitor.getDPLRecallSize()); + this.batchCollect = new BatchCollect(null, catVisitor.getDPLRecallSize()); } if (step.hasProperty(AbstractStep.CommandProperty.REQUIRE_PRECEDING_AGGREGATE)) { @@ -241,7 +243,8 @@ private void analyze() { if (breakpoint == -1) { breakpoint = i; } - } else if (step.hasProperty(AbstractStep.CommandProperty.AGGREGATE)) { + } + else if (step.hasProperty(AbstractStep.CommandProperty.AGGREGATE)) { LOGGER.info("[Analyze] Aggregate command: <{}>", step); aggregateCount++; @@ -257,6 +260,7 @@ private void analyze() { /** * Sends the processed batch to the {@link #batchHandler}
* This is where any possible sorting happens through dpf_02 + * * @param ds Processed batch dataset * @param id ID of the processed batch dataset */ @@ -288,40 +292,47 @@ public void call(Dataset batchDF, Long batchId) throws StreamingQueryExcept final long max = catVisitor.getCatalystContext().getDplMaximumLatest(); final long step = catVisitor.getCatalystContext().getTimeChartSpanSeconds(); - final Dataset rangeDs = - catVisitor.getCatalystContext() - .getSparkSession() - .range((min/step)*step, - ((max/step)+1) * step, step) - .select(functions.col("id").cast("timestamp").alias("_range")); + final Dataset rangeDs = catVisitor + .getCatalystContext() + .getSparkSession() + .range((min / step) * step, ((max / step) + 1) * step, step) + .select(functions.col("id").cast("timestamp").alias("_range")); // left join span to data & continue - batchDF = rangeDs.join( - batchDF, - rangeDs.col("_range").equalTo(batchDF.col("_time")), "left") + batchDF = rangeDs + .join(batchDF, rangeDs.col("_range").equalTo(batchDF.col("_time")), "left") .drop("_time") .withColumnRenamed("_range", "_time") .orderBy("_time"); - // fill null data with "0" for all types, except for the "_time" column for (final StructField field : batchDF.schema().fields()) { final String name = field.name(); final DataType dataType = field.dataType(); if (dataType == DataTypes.StringType) { - batchDF = batchDF.na().fill("0", new String[]{name}); + batchDF = batchDF.na().fill("0", new String[] { + name + }); } else if (dataType == DataTypes.IntegerType) { - batchDF = batchDF.na().fill(0, new String[]{name}); + batchDF = batchDF.na().fill(0, new String[] { + name + }); } else if (dataType == DataTypes.LongType) { - batchDF = batchDF.na().fill(0L, new String[]{name}); + batchDF = batchDF.na().fill(0L, new String[] { + name + }); } else if (dataType == DataTypes.DoubleType) { - batchDF = batchDF.na().fill(0d, new String[]{name}); + batchDF = batchDF.na().fill(0d, new String[] { + name + }); } else if (dataType == DataTypes.FloatType) { - batchDF = batchDF.na().fill(0f, new String[]{name}); + batchDF = batchDF.na().fill(0f, new String[] { + name + }); } // skip TimestampType } @@ -329,7 +340,11 @@ else if (dataType == DataTypes.FloatType) { // Continue sub list of steps execution, if necessary if (!this.list.isEmpty()) { - LOGGER.info("StepList batch processing - Continuing execution to next ops after breakpoint index: <{}>", breakpoint); + LOGGER + .info( + "StepList batch processing - Continuing execution to next ops after breakpoint index: <{}>", + breakpoint + ); Dataset ret = this.executeInBatch(batchDF); diff --git a/src/main/java/com/teragrep/pth10/ast/Text.java b/src/main/java/com/teragrep/pth10/ast/Text.java index b4b1c20..81c4ca5 100644 --- a/src/main/java/com/teragrep/pth10/ast/Text.java +++ b/src/main/java/com/teragrep/pth10/ast/Text.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,13 +43,12 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast; /** - * Interface for any type of text. - * e.g. gotten from a String, File, Stream etc. + * Interface for any type of text. e.g. gotten from a String, File, Stream etc. */ public interface Text { + String read(); } diff --git a/src/main/java/com/teragrep/pth10/ast/TextString.java b/src/main/java/com/teragrep/pth10/ast/TextString.java index 4998b49..9fdc207 100644 --- a/src/main/java/com/teragrep/pth10/ast/TextString.java +++ b/src/main/java/com/teragrep/pth10/ast/TextString.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast; /** @@ -60,7 +59,8 @@ public TextString(String text) { public TextString(Object text) { if (text == null) { this.text = ""; - } else { + } + else { this.text = text.toString(); } } diff --git a/src/main/java/com/teragrep/pth10/ast/TimeRange.java b/src/main/java/com/teragrep/pth10/ast/TimeRange.java index 76f069c..8a919ee 100644 --- a/src/main/java/com/teragrep/pth10/ast/TimeRange.java +++ b/src/main/java/com/teragrep/pth10/ast/TimeRange.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,8 +43,8 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast; -public enum TimeRange {TEN_SECONDS,ONE_MINUTE,FIVE_MINUTES, THIRTY_MINUTES, ONE_HOUR, ONE_DAY, ONE_MONTH} - +public enum TimeRange { + TEN_SECONDS, ONE_MINUTE, FIVE_MINUTES, THIRTY_MINUTES, ONE_HOUR, ONE_DAY, ONE_MONTH +} diff --git a/src/main/java/com/teragrep/pth10/ast/UnquotedText.java b/src/main/java/com/teragrep/pth10/ast/UnquotedText.java index a6401a0..ef9a1ce 100644 --- a/src/main/java/com/teragrep/pth10/ast/UnquotedText.java +++ b/src/main/java/com/teragrep/pth10/ast/UnquotedText.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast; import java.util.regex.Matcher; @@ -67,6 +66,7 @@ public String read() { /** * Strips quotes + * * @return string with stripped quotes */ private String stripQuotes(String quoted) { @@ -77,7 +77,8 @@ private String stripQuotes(String quoted) { // check "-quotes if (m.find()) { strUnquoted = m.group(1); - } else { + } + else { // check '-quotes if (m1.find()) { strUnquoted = m1.group(1); diff --git a/src/main/java/com/teragrep/pth10/ast/bo/CatalystNode.java b/src/main/java/com/teragrep/pth10/ast/bo/CatalystNode.java index 0810a82..86d73f1 100644 --- a/src/main/java/com/teragrep/pth10/ast/bo/CatalystNode.java +++ b/src/main/java/com/teragrep/pth10/ast/bo/CatalystNode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.bo; import org.apache.spark.sql.Dataset; @@ -54,39 +53,41 @@ * Node that can contain a Dataset and a DataStreamWriter */ public class CatalystNode extends Node { - Dataset val = null; - DataStreamWriter dsw = null; - - public CatalystNode(Token token) { - super(token); - } - public CatalystNode(Dataset ds) { - this.val = ds; - } - public CatalystNode(DataStreamWriter dsw) { - this.dsw = dsw; - } + Dataset val = null; + DataStreamWriter dsw = null; + + public CatalystNode(Token token) { + super(token); + } + + public CatalystNode(Dataset ds) { + this.val = ds; + } + + public CatalystNode(DataStreamWriter dsw) { + this.dsw = dsw; + } + + public Dataset getDataset() { + return val; + } + + public void setDataStreamWriter(DataStreamWriter dsw) { + this.dsw = dsw; + } - public Dataset getDataset(){ - return val; - } - - public void setDataStreamWriter(DataStreamWriter dsw) { - this.dsw = dsw; - } - - public DataStreamWriter getDataStreamWriter() { - return dsw; - } + public DataStreamWriter getDataStreamWriter() { + return dsw; + } - public String toString() { - if (val != null) { - String str = val.toString(); - return str; - } - else { - return "null"; - } - } + public String toString() { + if (val != null) { + String str = val.toString(); + return str; + } + else { + return "null"; + } + } } diff --git a/src/main/java/com/teragrep/pth10/ast/bo/ColumnNode.java b/src/main/java/com/teragrep/pth10/ast/bo/ColumnNode.java index 5ee0ecd..214deac 100644 --- a/src/main/java/com/teragrep/pth10/ast/bo/ColumnNode.java +++ b/src/main/java/com/teragrep/pth10/ast/bo/ColumnNode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.bo; import org.apache.spark.sql.Column; @@ -55,34 +54,38 @@ * Node that can contain a Column */ public class ColumnNode extends Node { - Column val = null; - public ColumnNode() { - super(); - } - public ColumnNode(Token token) { - super(token); - } - public ColumnNode(Column col) { - this.val = col; - } - public Column getColumn(){ - return val; - } + Column val = null; + + public ColumnNode() { + super(); + } + + public ColumnNode(Token token) { + super(token); + } + + public ColumnNode(Column col) { + this.val = col; + } + + public Column getColumn() { + return val; + } - public String toString() { - String str = val.toString(); - return str; - } + public String toString() { + String str = val.toString(); + return str; + } - public Expression asExpression(){ - return val.expr(); - } + public Expression asExpression() { + return val.expr(); + } - public Dataset asDataset(Datasetds){ - Datasetrv = null; - if(ds != null) - rv = ds.where(val); - return rv; - } + public Dataset asDataset(Dataset ds) { + Dataset rv = null; + if (ds != null) + rv = ds.where(val); + return rv; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/bo/ElementNode.java b/src/main/java/com/teragrep/pth10/ast/bo/ElementNode.java index 15f1cc2..7e47a56 100644 --- a/src/main/java/com/teragrep/pth10/ast/bo/ElementNode.java +++ b/src/main/java/com/teragrep/pth10/ast/bo/ElementNode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.bo; import org.w3c.dom.Element; @@ -57,45 +56,52 @@ * Node that can contain an (XML) element */ public class ElementNode extends Node { - Element val = null; - public ElementNode(Token token) { - super(token); - } - public ElementNode(Element element) { - this.val = element; - } - public Element getElement(){ - return val; - } + Element val = null; + + public ElementNode(Token token) { + super(token); + } + + public ElementNode(Element element) { + this.val = element; + } + + public Element getElement() { + return val; + } - public String toString() { - String str = null; - try { - TransformerFactory transFactory = TransformerFactory.newInstance(); - Transformer transformer = transFactory.newTransformer(); - StringWriter buffer = new StringWriter(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - transformer.transform(new DOMSource(val), - new StreamResult(buffer)); - str = buffer.toString(); - }catch(TransformerConfigurationException tex){} - catch(TransformerException ex){} - return str; - } + public String toString() { + String str = null; + try { + TransformerFactory transFactory = TransformerFactory.newInstance(); + Transformer transformer = transFactory.newTransformer(); + StringWriter buffer = new StringWriter(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + transformer.transform(new DOMSource(val), new StreamResult(buffer)); + str = buffer.toString(); + } + catch (TransformerConfigurationException tex) { + } + catch (TransformerException ex) { + } + return str; + } - public static String toString(Element val) { - String str = null; - try { - TransformerFactory transFactory = TransformerFactory.newInstance(); - Transformer transformer = transFactory.newTransformer(); - StringWriter buffer = new StringWriter(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - transformer.transform(new DOMSource(val), - new StreamResult(buffer)); - str = buffer.toString(); - }catch(TransformerConfigurationException tex){} - catch(TransformerException ex){} - return str; - } + public static String toString(Element val) { + String str = null; + try { + TransformerFactory transFactory = TransformerFactory.newInstance(); + Transformer transformer = transFactory.newTransformer(); + StringWriter buffer = new StringWriter(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + transformer.transform(new DOMSource(val), new StreamResult(buffer)); + str = buffer.toString(); + } + catch (TransformerConfigurationException tex) { + } + catch (TransformerException ex) { + } + return str; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/bo/ListNode.java b/src/main/java/com/teragrep/pth10/ast/bo/ListNode.java index 936f452..65bf728 100644 --- a/src/main/java/com/teragrep/pth10/ast/bo/ListNode.java +++ b/src/main/java/com/teragrep/pth10/ast/bo/ListNode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. 
If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.bo; import java.util.ArrayList; @@ -51,9 +50,11 @@ /** * Node that can contain an ArrayList of type T + * * @param type for arraylist */ public class ListNode extends Node { + private List list = new ArrayList<>(); public ListNode(Token token) { @@ -61,26 +62,26 @@ public ListNode(Token token) { } public ListNode(List lst) { - list=lst; + list = lst; } public ListNode() { super(); } - public void add(T s){ + public void add(T s) { list.add(s); } - public void add(int i, T s){ - list.add(i,s); + public void add(int i, T s) { + list.add(i, s); } - public T get(int i){ + public T get(int i) { return list.get(i); } - public List asList(){ + public List asList() { return list; } } diff --git a/src/main/java/com/teragrep/pth10/ast/bo/Node.java b/src/main/java/com/teragrep/pth10/ast/bo/Node.java index 1bfb38c..172563e 100644 --- a/src/main/java/com/teragrep/pth10/ast/bo/Node.java +++ b/src/main/java/com/teragrep/pth10/ast/bo/Node.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.bo; import java.util.ArrayList; @@ -53,96 +52,96 @@ * Base abstract class for all Nodes */ public abstract class Node { - - com.teragrep.pth10.ast.bo.Token token; - List children; - - public Node() { - ; // disconnected ones for lists etc - } - - public Node(com.teragrep.pth10.ast.bo.Token token ) { - this.token = token; - } - - // Convenience API to get token base type - public Token.Type getNodeType(){ - return token.getType(); - } - - public void addChild(Node child) { - if (this.children == null) { - this.children = new ArrayList(); - } - this.children.add(child); - } - - public List getChildren(){ - return this.children; - } - - public String toString() { - if (this.token != null) { - return this.token.toString(); - } - else { - return "null"; - } - } - - public String toTree() { - if (children != null && children.size() > 0) { - // recurse children - StringBuilder subTree = new StringBuilder(); - - subTree.append("{"); - subTree.append(this.toString()); - subTree.append(" "); - - // their - int nChild = this.children.size(); - while (nChild>0) { - Node child = children.get(nChild-1); - subTree.append(" "); - subTree.append(child.toTree()); - nChild--; - } - - subTree.append("}"); - - return subTree.toString(); - } - else { - // leaf - return this.toString(); - } - } - - public String toXMLTree() { - if (children != null && children.size() > 0) { - // recurse children - StringBuilder subTree = new StringBuilder(); - - subTree.append("<"); - subTree.append(this.toString()); - subTree.append(" "); - - // their - int nChild = this.children.size(); - while (nChild>0) { - Node child = children.get(nChild-1); - subTree.append(" "); - subTree.append(child.toTree()); - nChild--; - } - - subTree.append(">"); - - return subTree.toString(); - } - else { - // leaf - return this.toString(); - } - } + + com.teragrep.pth10.ast.bo.Token token; + List children; + + public Node() { + ; // disconnected ones for lists etc + } + + public Node(com.teragrep.pth10.ast.bo.Token token) { + this.token = token; + } + + // Convenience API to get token base type + public Token.Type getNodeType() { + return token.getType(); + } + + public void addChild(Node child) { + if (this.children == null) { + this.children = new ArrayList(); + } + this.children.add(child); + } + + public List getChildren() { + return this.children; + } + + public String toString() { + if (this.token != null) { + return this.token.toString(); + } + else { + return "null"; + } + } + + public String toTree() { + if (children != null && children.size() > 0) { + // recurse children + StringBuilder subTree = new StringBuilder(); + + subTree.append("{"); + subTree.append(this.toString()); + subTree.append(" "); + + // their + int nChild = this.children.size(); + while (nChild > 0) { + Node child = children.get(nChild - 1); + subTree.append(" "); + subTree.append(child.toTree()); + nChild--; + } + + subTree.append("}"); + + return subTree.toString(); + } + else { + // leaf + return this.toString(); + } + } + + public String toXMLTree() { + if (children != null && children.size() > 0) { + // recurse children + StringBuilder subTree = new StringBuilder(); + + subTree.append("<"); + subTree.append(this.toString()); + subTree.append(" "); + + // their + int nChild = this.children.size(); + while (nChild > 0) { + Node child = children.get(nChild - 1); + subTree.append(" "); + subTree.append(child.toTree()); + nChild--; + } + + subTree.append(">"); + + return subTree.toString(); + } + else { + // leaf + return 
this.toString(); + } + } } diff --git a/src/main/java/com/teragrep/pth10/ast/bo/NullNode.java b/src/main/java/com/teragrep/pth10/ast/bo/NullNode.java index 54f1f67..c7e4849 100644 --- a/src/main/java/com/teragrep/pth10/ast/bo/NullNode.java +++ b/src/main/java/com/teragrep/pth10/ast/bo/NullNode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -49,6 +49,7 @@ * When a visitor function does not return anything, use this instead of returning null. */ public class NullNode extends Node { + public NullNode() { super(); } diff --git a/src/main/java/com/teragrep/pth10/ast/bo/StepListNode.java b/src/main/java/com/teragrep/pth10/ast/bo/StepListNode.java index 7b21f44..fdb88d3 100644 --- a/src/main/java/com/teragrep/pth10/ast/bo/StepListNode.java +++ b/src/main/java/com/teragrep/pth10/ast/bo/StepListNode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.bo; import com.teragrep.pth10.steps.AbstractStep; @@ -55,6 +54,7 @@ * Node for {@literal List} */ public class StepListNode extends Node { + private List list = new ArrayList<>(); public StepListNode(Token token) { @@ -62,14 +62,14 @@ public StepListNode(Token token) { } public StepListNode(List lst) { - list=lst; + list = lst; } public StepListNode() { super(); } - public List asList(){ + public List asList() { return list; } diff --git a/src/main/java/com/teragrep/pth10/ast/bo/StepNode.java b/src/main/java/com/teragrep/pth10/ast/bo/StepNode.java index 7a0e1cb..6cb5dce 100644 --- a/src/main/java/com/teragrep/pth10/ast/bo/StepNode.java +++ b/src/main/java/com/teragrep/pth10/ast/bo/StepNode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. 
* * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -50,6 +50,7 @@ public class StepNode extends Node { private AbstractStep step = null; + public StepNode(Token token) { super(token); } diff --git a/src/main/java/com/teragrep/pth10/ast/bo/StringListNode.java b/src/main/java/com/teragrep/pth10/ast/bo/StringListNode.java index 9b13b2d..511406d 100644 --- a/src/main/java/com/teragrep/pth10/ast/bo/StringListNode.java +++ b/src/main/java/com/teragrep/pth10/ast/bo/StringListNode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.bo; import java.util.ArrayList; @@ -53,34 +52,35 @@ * Node for {@literal List} */ public class StringListNode extends Node { - private List list = new ArrayList<>(); - public StringListNode(Token token) { - super(token); - list.add(token.value); - } + private List list = new ArrayList<>(); + + public StringListNode(Token token) { + super(token); + list.add(token.value); + } - public StringListNode(List lst) { - list=lst; - } + public StringListNode(List lst) { + list = lst; + } - public StringListNode() { - super(); - } + public StringListNode() { + super(); + } - public void add(String s){ - list.add(s); - } + public void add(String s) { + list.add(s); + } - public void add(int i, String s){ - list.add(i,s); - } + public void add(int i, String s) { + list.add(i, s); + } - public String get(int i){ - return list.get(i); - } + public String get(int i) { + return list.get(i); + } - public List asList(){ - return list; - } + public List asList() { + return list; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/bo/StringNode.java b/src/main/java/com/teragrep/pth10/ast/bo/StringNode.java index e68fa79..4b43422 100644 --- a/src/main/java/com/teragrep/pth10/ast/bo/StringNode.java +++ b/src/main/java/com/teragrep/pth10/ast/bo/StringNode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,14 +43,14 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.bo; /** * Node for Strings */ public class StringNode extends Node { - public StringNode(Token token) { - super(token); - } + + public StringNode(Token token) { + super(token); + } } diff --git a/src/main/java/com/teragrep/pth10/ast/bo/SubSearchNode.java b/src/main/java/com/teragrep/pth10/ast/bo/SubSearchNode.java index 31d6329..feafc28 100644 --- a/src/main/java/com/teragrep/pth10/ast/bo/SubSearchNode.java +++ b/src/main/java/com/teragrep/pth10/ast/bo/SubSearchNode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.bo; import org.apache.spark.sql.Column; @@ -63,88 +62,86 @@ * Node to contain information of subsearches */ public class SubSearchNode extends ColumnNode { - private static final Logger LOGGER = LoggerFactory.getLogger(SubSearchNode.class); - - private List valList = new ArrayList<>(); - - /** - * Empty constructor, values needs to be filled up. 
- */ - public SubSearchNode() { - super.val = null; - } - - public SubSearchNode(Token token) - { - super(token); - } - - public SubSearchNode(Column col) { - super.val = col; - } - - public SubSearchNode(String str) { - valList.add(str); - val=new Column("_raw").like("%"+str+"%"); - } - - - public Column getColumn(){ - return val; - } - - public String getStrVal(){ - return valList.stream().collect(Collectors.joining(",")); - } - - public String toString() { - String str = val.expr().sql(); - return str; - } - - public Expression asExpression(){ - return val.expr(); - } - - public void addValue(String valStr){ - valList.add(valStr); -// valStr="%"+valStr+"%"; - valStr="(?i)^*"+valStr+"*"; - if(this.val == null){ - this.val = new Column("_raw").rlike(valStr); - } else { - this.val = this.val.and(new Column("_raw").rlike(valStr)); - } - LOGGER.info("subSearchNode current val: <{}>", val.expr().sql()); - } - - - public Element asElement(Document d) - { - Element el = d.createElement("indexstatement"); - el.setAttribute("OPERATION", "EQUALS"); - el.setAttribute("value", "%" + valList.get(0) + "%"); - LOGGER.info("Construct archiveQuery: <{}>", ElementNode.toString(el)); - if(valList.size()>1) { - for(int i=1;i", ElementNode.toString(el)); - Element andE = d.createElement("AND"); - andE.appendChild(el); - andE.appendChild(e); - el=andE; - } - } - LOGGER.info("SubNode=<{}>", new ElementNode(el)); - return el; - } - - public Dataset asDataset(Datasetds){ - Datasetrv = null; - if(ds != null) - rv = ds.where(val); - return rv; - } + + private static final Logger LOGGER = LoggerFactory.getLogger(SubSearchNode.class); + + private List valList = new ArrayList<>(); + + /** + * Empty constructor, values needs to be filled up. + */ + public SubSearchNode() { + super.val = null; + } + + public SubSearchNode(Token token) { + super(token); + } + + public SubSearchNode(Column col) { + super.val = col; + } + + public SubSearchNode(String str) { + valList.add(str); + val = new Column("_raw").like("%" + str + "%"); + } + + public Column getColumn() { + return val; + } + + public String getStrVal() { + return valList.stream().collect(Collectors.joining(",")); + } + + public String toString() { + String str = val.expr().sql(); + return str; + } + + public Expression asExpression() { + return val.expr(); + } + + public void addValue(String valStr) { + valList.add(valStr); + // valStr="%"+valStr+"%"; + valStr = "(?i)^*" + valStr + "*"; + if (this.val == null) { + this.val = new Column("_raw").rlike(valStr); + } + else { + this.val = this.val.and(new Column("_raw").rlike(valStr)); + } + LOGGER.info("subSearchNode current val: <{}>", val.expr().sql()); + } + + public Element asElement(Document d) { + Element el = d.createElement("indexstatement"); + el.setAttribute("OPERATION", "EQUALS"); + el.setAttribute("value", "%" + valList.get(0) + "%"); + LOGGER.info("Construct archiveQuery: <{}>", ElementNode.toString(el)); + if (valList.size() > 1) { + for (int i = 1; i < valList.size(); i++) { + Element e = d.createElement("indexstatement"); + e.setAttribute("OPERATION", "EQUALS"); + e.setAttribute("value", "%" + valList.get(i) + "%"); + LOGGER.info("Construct archiveQuery: <{}>", ElementNode.toString(el)); + Element andE = d.createElement("AND"); + andE.appendChild(el); + andE.appendChild(e); + el = andE; + } + } + LOGGER.info("SubNode=<{}>", new ElementNode(el)); + return el; + } + + public Dataset asDataset(Dataset ds) { + Dataset rv = null; + if (ds != null) + rv = ds.where(val); + return rv; + } } diff --git 
a/src/main/java/com/teragrep/pth10/ast/bo/Token.java b/src/main/java/com/teragrep/pth10/ast/bo/Token.java index 662b809..66128a7 100644 --- a/src/main/java/com/teragrep/pth10/ast/bo/Token.java +++ b/src/main/java/com/teragrep/pth10/ast/bo/Token.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,61 +43,61 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.bo; /** * Token class to declare Token with certain types and values */ public class Token { - public enum Type { - AND, - OR, - INVALID, - EQUALS, - NOT_EQUALS, - LIKE, - NOT_LIKE, - GT, - GE, - LT, - LE, - DIRECTORY, - MATCH, - LOGICAL_STATEMENT, - INDEX_STATEMENT, - SEARCH_QUALIFIER, - SUB_LOGICAL_STATEMENT, - TIMEFORMAT_STATEMENT, - TRANSFORM_STATEMENT, - STRING, - STRINGLIST, - STEP, - IF_STATEMENT - } - Type type; - String value; + public enum Type { + AND, + OR, + INVALID, + EQUALS, + NOT_EQUALS, + LIKE, + NOT_LIKE, + GT, + GE, + LT, + LE, + DIRECTORY, + MATCH, + LOGICAL_STATEMENT, + INDEX_STATEMENT, + SEARCH_QUALIFIER, + SUB_LOGICAL_STATEMENT, + TIMEFORMAT_STATEMENT, + TRANSFORM_STATEMENT, + STRING, + STRINGLIST, + STEP, + IF_STATEMENT + } + + Type type; + String value; - public Token(Type type, String value) { - this.type = type; - this.value = value; - } + public Token(Type type, String value) { + this.type = type; + this.value = value; + } - public Token(Type type) { - this.type = type; - } + public Token(Type type) { + this.type = type; + } - public Type getType() { - return this.type; - } + public Type getType() { + return this.type; + } - public String toString() { - if (this.value != null) { - return this.value; - } - else { - return this.type.toString(); - } - } + public String toString() { + if (this.value != null) { + return this.value; + } + else { + return this.type.toString(); + } + } } diff --git a/src/main/java/com/teragrep/pth10/ast/bo/TranslationResultNode.java b/src/main/java/com/teragrep/pth10/ast/bo/TranslationResultNode.java index 01d48e9..688e6c6 100644 --- a/src/main/java/com/teragrep/pth10/ast/bo/TranslationResultNode.java +++ b/src/main/java/com/teragrep/pth10/ast/bo/TranslationResultNode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -48,6 +48,7 @@ import com.teragrep.pth10.ast.StepList; public class TranslationResultNode extends Node { + public final StepList stepList; public TranslationResultNode(final StepList sl) { diff --git a/src/main/java/com/teragrep/pth10/ast/commands/EmitMode.java b/src/main/java/com/teragrep/pth10/ast/commands/EmitMode.java index 2f805ed..a5767af 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/EmitMode.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/EmitMode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,14 +43,20 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands; /** - *

<p>Used to set the mode for the translation to work in; XML or CATALYST.</p>
- * <p>XML is only used in logicalStatement for the archive query, otherwise
- * everything should be processed in CATALYST (Spark) mode.</p>
+ * <p>
+ * Used to set the mode for the translation to work in; XML or CATALYST.
+ * </p>
+ * <p>
+ * XML is only used in logicalStatement for the archive query, otherwise everything should be processed in
+ * CATALYST (Spark) mode.
+ * </p>
*/ public class EmitMode { - public enum mode {XML,CATALYST} + + public enum mode { + XML, CATALYST + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/AggregateFunction.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/AggregateFunction.java index b733f20..7d1c5b3 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/AggregateFunction.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/AggregateFunction.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.aggregate; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -66,18 +65,18 @@ import org.slf4j.LoggerFactory; /** - * Processes any aggregate functions used for example by the stats command. - * The aggregation function is returned as a column, which will be applied - * to the desired dataset using the Dataset.agg() method. + * Processes any aggregate functions used for example by the stats command. The aggregation function is returned as a + * column, which will be applied to the desired dataset using the Dataset.agg() method. */ public class AggregateFunction extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(AggregateFunction.class); private String aggregateField = null; private final DPLParserCatalystContext catCtx; /** - * Constructor for the aggregate function, used to initialize - * the class inside aggregating commands like statsTransformation. + * Constructor for the aggregate function, used to initialize the class inside aggregating commands like + * statsTransformation. */ public AggregateFunction(DPLParserCatalystContext catCtx) { this.catCtx = catCtx; @@ -87,10 +86,10 @@ public String getAggregateField() { return this.aggregateField; } - /** * Visit aggregate function
* func( arg ) + * * @param ctx AggregationFunctionContext * @return Node containing the column needed for processing the aggregation */ @@ -103,15 +102,14 @@ public Node AggregateFunctionEmitCatalyst(DPLParser.AggregateFunctionContext ctx } /** - * -- Aggregate method: Count -- - * Uses the built-in Spark function count(). + * -- Aggregate method: Count -- Uses the built-in Spark function count(). */ @Override public Node visitAggregateMethodCount(DPLParser.AggregateMethodCountContext ctx) { Node rv = aggregateMethodCountEmitCatalyst(ctx); // Default fieldname - aggregateField ="count"; + aggregateField = "count"; return rv; } @@ -134,15 +132,14 @@ public Node aggregateMethodCountEmitCatalyst(DPLParser.AggregateMethodCountConte } /** - * -- Aggregate method: Sum -- - * Uses the built-in Spark function sum(). + * -- Aggregate method: Sum -- Uses the built-in Spark function sum(). */ @Override public Node visitAggregateMethodSum(DPLParser.AggregateMethodSumContext ctx) { Node rv = aggregateMethodSumEmitCatalyst(ctx); // Default fieldname - aggregateField ="sum"; + aggregateField = "sum"; return rv; } @@ -156,7 +153,8 @@ public Node aggregateMethodSumEmitCatalyst(DPLParser.AggregateMethodSumContext c if (columnPt != null) { col = new SumAggregator(new FieldIndexImpl(columnPt.getText()), catCtx.nullValue).toColumn(); resultColumnName = String.format("sum(%s)", columnPt.getText()); - } else { + } + else { col = new SumAggregator(new FieldIndexStub(), catCtx.nullValue).toColumn(); } @@ -165,15 +163,15 @@ public Node aggregateMethodSumEmitCatalyst(DPLParser.AggregateMethodSumContext c } /** - * -- Aggregate method: Median -- - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ExactPercentileAggregator} - * with percentile = 0.5d to calculate the median. + * -- Aggregate method: Median -- Uses the + * {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ExactPercentileAggregator} with + * percentile = 0.5d to calculate the median. */ public Node visitAggregateMethodMedian(DPLParser.AggregateMethodMedianContext ctx) { Node rv = aggregateMethodMedianEmitCatalyst(ctx); // Default fieldname - aggregateField ="median"; + aggregateField = "median"; return rv; } @@ -192,20 +190,24 @@ public Node aggregateMethodMedianEmitCatalyst(DPLParser.AggregateMethodMedianCon } /** - * -- Aggregate method: EstimatedDistinctCount_error -- - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.UDAF_DistinctCount} with built-in spark - * functions approx_count_distinct(), abs() and divide(). + * -- Aggregate method: EstimatedDistinctCount_error -- Uses the + * {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.UDAF_DistinctCount} with built-in spark functions + * approx_count_distinct(), abs() and divide(). *
estdc_error = abs(estimate_distinct_count - real_distinct_count)/real_distinct_count
*/ - public Node visitAggregateMethodEstimatedDistinctErrorCount(DPLParser.AggregateMethodEstimatedDistinctErrorCountContext ctx) { + public Node visitAggregateMethodEstimatedDistinctErrorCount( + DPLParser.AggregateMethodEstimatedDistinctErrorCountContext ctx + ) { Node rv = aggregateMethodEstimatedDistinctErrorCountEmitCatalyst(ctx); // Default fieldname - aggregateField ="estdc"; + aggregateField = "estdc"; return rv; } - public Node aggregateMethodEstimatedDistinctErrorCountEmitCatalyst(DPLParser.AggregateMethodEstimatedDistinctErrorCountContext ctx) { + public Node aggregateMethodEstimatedDistinctErrorCountEmitCatalyst( + DPLParser.AggregateMethodEstimatedDistinctErrorCountContext ctx + ) { Node rv = null; String arg = ctx.getChild(1).getText(); @@ -236,10 +238,8 @@ public Node aggregateMethodEstimatedDistinctErrorCountEmitCatalyst(DPLParser.Agg } /** - * -- Aggregate method: Range -- - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.MinMaxAggregator} in - * RANGE mode to perform the aggregation. - *
range(x) = max(x) - min(x)
+ * -- Aggregate method: Range -- Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.MinMaxAggregator} + * in RANGE mode to perform the aggregation.
range(x) = max(x) - min(x)
*/ public Node visitAggregateMethodRange(DPLParser.AggregateMethodRangeContext ctx) { Node rv = aggregateMethodRangeEmitCatalyst(ctx); @@ -264,15 +264,13 @@ public Node aggregateMethodRangeEmitCatalyst(DPLParser.AggregateMethodRangeConte } /** - * -- Aggregate method: SumSquare -- - * Uses the built-in Spark functions pow() and sum() to calculate first the square and then the sum - * to form the sum of squares. - *
sumSq = x0^2 + x1^2 + ... + xn^2
+ * -- Aggregate method: SumSquare -- Uses the built-in Spark functions pow() and sum() to calculate first the square + * and then the sum to form the sum of squares.
sumSq = x0^2 + x1^2 + ... + xn^2
*/ public Node visitAggregateMethodSumSquare(DPLParser.AggregateMethodSumSquareContext ctx) { Node rv = aggregateMethodSumSquareEmitCatalyst(ctx); - aggregateField ="sumsq"; + aggregateField = "sumsq"; return rv; } @@ -293,13 +291,13 @@ public Node aggregateMethodSumSquareEmitCatalyst(DPLParser.AggregateMethodSumSqu } /** - * -- Aggregate method: DistinctCount -- - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.DistinctCountAggregator} to perform the aggregation. + * -- Aggregate method: DistinctCount -- Uses the + * {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.DistinctCountAggregator} to perform the aggregation. */ public Node visitAggregateMethodDistinctCount(DPLParser.AggregateMethodDistinctCountContext ctx) { Node rv = aggregateMethodDistinctCountEmitCatalyst(ctx); - aggregateField ="dc"; + aggregateField = "dc"; return rv; } @@ -319,17 +317,18 @@ public Node aggregateMethodDistinctCountEmitCatalyst(DPLParser.AggregateMethodDi } /** - * -- Aggregate method: EstimatedDistinctCount -- - * Uses the built-in Spark function approx_count_distinct() + * -- Aggregate method: EstimatedDistinctCount -- Uses the built-in Spark function approx_count_distinct() */ public Node visitAggregateMethodEstimatedDistinctCount(DPLParser.AggregateMethodEstimatedDistinctCountContext ctx) { Node rv = aggregateMethodEstimatedDistinctCountEmitCatalyst(ctx); - aggregateField ="estdc"; + aggregateField = "estdc"; return rv; } - public Node aggregateMethodEstimatedDistinctCountEmitCatalyst(DPLParser.AggregateMethodEstimatedDistinctCountContext ctx) { + public Node aggregateMethodEstimatedDistinctCountEmitCatalyst( + DPLParser.AggregateMethodEstimatedDistinctCountContext ctx + ) { Node rv = null; String arg = ctx.getChild(1).getText(); @@ -343,14 +342,13 @@ public Node aggregateMethodEstimatedDistinctCountEmitCatalyst(DPLParser.Aggregat } /** - * -- Aggregate method: Max -- - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.MinMaxAggregator} in + * -- Aggregate method: Max -- Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.MinMaxAggregator} in * MAX mode to perform the aggregation. */ public Node visitAggregateMethodMax(DPLParser.AggregateMethodMaxContext ctx) { Node rv = aggregateMethodMaxEmitCatalyst(ctx); - aggregateField ="max"; + aggregateField = "max"; return rv; } @@ -368,14 +366,13 @@ public Node aggregateMethodMaxEmitCatalyst(DPLParser.AggregateMethodMaxContext c } /** - * -- Aggregate method: Min -- - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.MinMaxAggregator} in + * -- Aggregate method: Min -- Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.MinMaxAggregator} in * MIN mode to perform the aggregation. 
*/ public Node visitAggregateMethodMin(DPLParser.AggregateMethodMinContext ctx) { Node rv = aggregateMethodMinEmitCatalyst(ctx); - aggregateField ="min"; + aggregateField = "min"; return rv; } @@ -393,13 +390,12 @@ public Node aggregateMethodMinEmitCatalyst(DPLParser.AggregateMethodMinContext c } /** - * -- Aggregate method: Variance -- - * Uses the built-in Spark functions var_samp() and var_pop() + * -- Aggregate method: Variance -- Uses the built-in Spark functions var_samp() and var_pop() */ public Node visitAggregateMethodVariance(DPLParser.AggregateMethodVarianceContext ctx) { Node rv = aggregateMethodVarianceEmitCatalyst(ctx); - aggregateField ="var"; + aggregateField = "var"; return rv; } @@ -430,13 +426,12 @@ public Node aggregateMethodVarianceEmitCatalyst(DPLParser.AggregateMethodVarianc } /** - * -- Aggregate method: Standard Deviation -- - * Uses the built-in Spark function for stddev_samp() and stddev_pop() + * -- Aggregate method: Standard Deviation -- Uses the built-in Spark function for stddev_samp() and stddev_pop() */ public Node visitAggregateMethodStandardDeviation(DPLParser.AggregateMethodStandardDeviationContext ctx) { Node rv = aggregateMethodStandardDeviationEmitCatalyst(ctx); - aggregateField ="stdev"; + aggregateField = "stdev"; return rv; } @@ -465,8 +460,7 @@ public Node aggregateMethodStandardDeviationEmitCatalyst(DPLParser.AggregateMeth } /** - * -- Aggregate method: Avg -- - * Uses the built-in Spark function for avg(). + * -- Aggregate method: Avg -- Uses the built-in Spark function for avg(). */ public Node visitAggregateMethodAvg(DPLParser.AggregateMethodAvgContext ctx) { Node rv = aggregateMethodAvgEmitCatalyst(ctx); @@ -500,9 +494,8 @@ public Node aggregateMethodAvgEmitCatalyst(DPLParser.AggregateMethodAvgContext c } /** - * -- Aggregate method: Mode -- - * Returns the field value with most occurrences within a given column. - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ModeAggregator} to perform the aggregation. + * -- Aggregate method: Mode -- Returns the field value with most occurrences within a given column. Uses the + * {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ModeAggregator} to perform the aggregation. */ public Node visitAggregateMethodMode(DPLParser.AggregateMethodModeContext ctx) { Node rv = aggregateMethodModeEmitCatalyst(ctx); @@ -545,7 +538,7 @@ public Node aggregateMethodFirstEmitCatalyst(DPLParser.AggregateMethodFirstConte public Node visitAggregateMethodLast(DPLParser.AggregateMethodLastContext ctx) { Node rv = aggregateMethodLastEmitCatalyst(ctx); - aggregateField ="last"; + aggregateField = "last"; return rv; } @@ -556,15 +549,14 @@ public Node aggregateMethodLastEmitCatalyst(DPLParser.AggregateMethodLastContext } /** - * -- Aggregate method: Earliest -- - * Returns the row with the earliest timestamp - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.EarliestLatestAggregator_String} in - * EARLIEST mode to perform the aggregation. + * -- Aggregate method: Earliest -- Returns the row with the earliest timestamp Uses the + * {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.EarliestLatestAggregator_String} in EARLIEST + * mode to perform the aggregation. 
*/ public Node visitAggregateMethodEarliest(DPLParser.AggregateMethodEarliestContext ctx) { Node rv = aggregateMethodEarliestEmitCatalyst(ctx); - aggregateField ="earliest"; + aggregateField = "earliest"; return rv; } @@ -581,8 +573,8 @@ public Node aggregateMethodEarliestEmitCatalyst(DPLParser.AggregateMethodEarlies } /** - * -- Aggregation method: Earliest_time -- - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.EarliestLatestAggregator_String} in + * -- Aggregation method: Earliest_time -- Uses the + * {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.EarliestLatestAggregator_String} in * EARLIEST_TIME mode to perform the aggregation. */ public Node visitAggregateMethodEarliestTime(DPLParser.AggregateMethodEarliestTimeContext ctx) { @@ -599,15 +591,18 @@ public Node aggregateMethodEarliestTimeEmitCatalyst(DPLParser.AggregateMethodEar String columnName = ctx.getChild(1).getText(); String resultColumnName = String.format("earliest_time(%s)", columnName); - Column asUnixTime = new EarliestLatestAggregator_String(columnName, AggregatorMode.EarliestLatestAggregatorMode.EARLIEST_TIME).toColumn(); + Column asUnixTime = new EarliestLatestAggregator_String( + columnName, + AggregatorMode.EarliestLatestAggregatorMode.EARLIEST_TIME + ).toColumn(); return new ColumnNode(asUnixTime.as(resultColumnName)); } /** - * -- Aggregation method: Latest -- - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.EarliestLatestAggregator_String} in - * LATEST mode to perform the aggregation. + * -- Aggregation method: Latest -- Uses the + * {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.EarliestLatestAggregator_String} in LATEST + * mode to perform the aggregation. */ public Node visitAggregateMethodLatest(DPLParser.AggregateMethodLatestContext ctx) { Node rv = aggregateMethodLatestEmitCatalyst(ctx); @@ -623,20 +618,21 @@ public Node aggregateMethodLatestEmitCatalyst(DPLParser.AggregateMethodLatestCon String resultColumnName = String.format("latest(%s)", colName); // use aggregator - Column col = new EarliestLatestAggregator_String(colName, AggregatorMode.EarliestLatestAggregatorMode.LATEST).toColumn(); + Column col = new EarliestLatestAggregator_String(colName, AggregatorMode.EarliestLatestAggregatorMode.LATEST) + .toColumn(); return new ColumnNode(col.as(resultColumnName)); } /** - * -- Aggregation method: Latest_time -- - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.EarliestLatestAggregator_String} in + * -- Aggregation method: Latest_time -- Uses the + * {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.EarliestLatestAggregator_String} in * LATEST_TIME mode to perform the aggregation. 
*/ public Node visitAggregateMethodLatestTime(DPLParser.AggregateMethodLatestTimeContext ctx) { Node rv = aggregateMethodLatestTimeEmitCatalyst(ctx); - aggregateField ="latest_time"; + aggregateField = "latest_time"; return rv; } @@ -647,19 +643,21 @@ public Node aggregateMethodLatestTimeEmitCatalyst(DPLParser.AggregateMethodLates String colName = ctx.getChild(1).getText(); String resultColumnName = String.format("latest_time(%s)", colName); - Column asUnixTime = new EarliestLatestAggregator_String(colName, AggregatorMode.EarliestLatestAggregatorMode.LATEST_TIME).toColumn(); + Column asUnixTime = new EarliestLatestAggregator_String( + colName, + AggregatorMode.EarliestLatestAggregatorMode.LATEST_TIME + ).toColumn(); return new ColumnNode(asUnixTime.as(resultColumnName)); } /** - * -- Aggregate method: List -- - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ValuesAggregator} in LIST mode - * to perform the aggregation. + * -- Aggregate method: List -- Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ValuesAggregator} in + * LIST mode to perform the aggregation. */ public Node visitAggregateMethodList(DPLParser.AggregateMethodListContext ctx) { Node rv = aggregateMethodListEmitCatalyst(ctx); - aggregateField ="list"; + aggregateField = "list"; return rv; } @@ -681,13 +679,13 @@ public Node aggregateMethodListEmitCatalyst(DPLParser.AggregateMethodListContext } /** - * -- Aggregate method: Values -- - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ValuesAggregator} to perform the aggregation + * -- Aggregate method: Values -- Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ValuesAggregator} + * to perform the aggregation */ public Node visitAggregateMethodValues(DPLParser.AggregateMethodValuesContext ctx) { Node rv = aggregateMethodValuesEmitCatalyst(ctx); - aggregateField ="values"; + aggregateField = "values"; return rv; } @@ -706,68 +704,69 @@ public Node aggregateMethodValuesEmitCatalyst(DPLParser.AggregateMethodValuesCon } /** - * -- Aggregate method: Percentile -- - * Can calculate the percentile multiple different ways, - * exactperc(), upperperc() and perc(). - * Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ExactPercentileAggregator} to perform exactperc(), - * and {@link com.teragrep.pth10.ast.commands.aggregate.utils.PercentileApprox} for upperperc() and perc(). + * -- Aggregate method: Percentile -- Can calculate the percentile multiple different ways, exactperc(), upperperc() + * and perc(). Uses the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ExactPercentileAggregator} to perform + * exactperc(), and {@link com.teragrep.pth10.ast.commands.aggregate.utils.PercentileApprox} for upperperc() and + * perc(). 
*/ public Node visitAggregateMethodPercentileVariable(DPLParser.AggregateMethodPercentileVariableContext ctx) { - LOGGER.debug("Visiting percX(Y)"); - Node rv = aggregateMethodPercentileVariableEmitCatalyst(ctx); + LOGGER.debug("Visiting percX(Y)"); + Node rv = aggregateMethodPercentileVariableEmitCatalyst(ctx); - aggregateField ="percentile"; + aggregateField = "percentile"; return rv; } // TODO Implement upperperc public Node aggregateMethodPercentileVariableEmitCatalyst(DPLParser.AggregateMethodPercentileVariableContext ctx) { - TerminalNode func = (TerminalNode) ctx.getChild(0); - String commandName = func.getText(); - String colName = ctx.getChild(2).getText(); - Column col = null; - - // Make sure the result column name matches the DPL command - String resultColumnName = String.format("%s(%s)", commandName, colName); - - LOGGER.debug("Command: <[{}]>", commandName); - - // There are four different options for func: pX, percX, exactpercX, upperpercX - // This separates the X from the command name into its own variable xThPercentileArg. - String funcAsString = func.getText(); - - double xThPercentileArg = - (funcAsString.length() <= 4) ? - Double.valueOf(funcAsString.substring(funcAsString.indexOf('p') + 1)) : /* pX */ - Double.valueOf(funcAsString.substring(funcAsString.lastIndexOf('c') + 1)); /* percX, exactpercX, upperpercX */ - - LOGGER.debug("perc: Use percentile = <[{}]>", xThPercentileArg); - - switch (func.getSymbol().getType()) { - case DPLLexer.METHOD_AGGREGATE_P_VARIABLE: - case DPLLexer.METHOD_AGGREGATE_PERC_VARIABLE: { - col = new PercentileApprox().percentile_approx(functions.col(colName), functions.lit(xThPercentileArg/100)); - break; - } - case DPLLexer.METHOD_AGGREGATE_EXACTPERC_VARIABLE: { - col = new ExactPercentileAggregator(colName, xThPercentileArg/100).toColumn(); - break; - } - case DPLLexer.METHOD_AGGREGATE_UPPERPERC_VARIABLE: { - // upperperc() returns the same as perc() if under 1000 values - // TODO over 1000 distinct values - col = new PercentileApprox().percentile_approx(functions.col(colName), functions.lit(xThPercentileArg/100)); - //throw new UnsupportedOperationException("Upper percentile mode not supported yet"); - break; - } - } - - return new ColumnNode(col.as(resultColumnName)); + TerminalNode func = (TerminalNode) ctx.getChild(0); + String commandName = func.getText(); + String colName = ctx.getChild(2).getText(); + Column col = null; + + // Make sure the result column name matches the DPL command + String resultColumnName = String.format("%s(%s)", commandName, colName); + + LOGGER.debug("Command: <[{}]>", commandName); + + // There are four different options for func: pX, percX, exactpercX, upperpercX + // This separates the X from the command name into its own variable xThPercentileArg. + String funcAsString = func.getText(); + + double xThPercentileArg = (funcAsString.length() <= 4) ? 
Double + .valueOf(funcAsString.substring(funcAsString.indexOf('p') + 1)) : /* pX */ + Double + .valueOf(funcAsString.substring(funcAsString.lastIndexOf('c') + 1)); /* percX, exactpercX, upperpercX */ + + LOGGER.debug("perc: Use percentile = <[{}]>", xThPercentileArg); + + switch (func.getSymbol().getType()) { + case DPLLexer.METHOD_AGGREGATE_P_VARIABLE: + case DPLLexer.METHOD_AGGREGATE_PERC_VARIABLE: { + col = new PercentileApprox() + .percentile_approx(functions.col(colName), functions.lit(xThPercentileArg / 100)); + break; + } + case DPLLexer.METHOD_AGGREGATE_EXACTPERC_VARIABLE: { + col = new ExactPercentileAggregator(colName, xThPercentileArg / 100).toColumn(); + break; + } + case DPLLexer.METHOD_AGGREGATE_UPPERPERC_VARIABLE: { + // upperperc() returns the same as perc() if under 1000 values + // TODO over 1000 distinct values + col = new PercentileApprox() + .percentile_approx(functions.col(colName), functions.lit(xThPercentileArg / 100)); + //throw new UnsupportedOperationException("Upper percentile mode not supported yet"); + break; + } + } + + return new ColumnNode(col.as(resultColumnName)); } /** - * -- Aggregate method: rate -- - * Represents
(latest(X) - earliest(X)) / (latest_time(X) - earliest_time(X))
+ * -- Aggregate method: rate -- Represents + *
(latest(X) - earliest(X)) / (latest_time(X) - earliest_time(X))
*/ public Node visitAggregateMethodRate(DPLParser.AggregateMethodRateContext ctx) { Node rv = aggregateMethodRateEmitCatalyst(ctx); @@ -782,8 +781,9 @@ public Node aggregateMethodRateEmitCatalyst(DPLParser.AggregateMethodRateContext // Make sure the result column name matches the DPL command String resultColumnName = String.format("rate(%s)", colName); - Column res = new EarliestLatestAggregator_Double(colName, AggregatorMode.EarliestLatestAggregatorMode.RATE).toColumn(); + Column res = new EarliestLatestAggregator_Double(colName, AggregatorMode.EarliestLatestAggregatorMode.RATE) + .toColumn(); return new ColumnNode(res.as(resultColumnName)); } -} \ No newline at end of file +} diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/AggregatorMode.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/AggregatorMode.java index d1fc3aa..ebc1bbc 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/AggregatorMode.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/AggregatorMode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,36 +43,38 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs; /** - * Collection of enumerables to change the mode of certain Aggregators - * (Some Aggregators can produce different results based on the mode it is operating in) + * Collection of enumerables to change the mode of certain Aggregators (Some Aggregators can produce different results + * based on the mode it is operating in) + * * @author eemhu - * */ public class AggregatorMode { - - /** Used for ValuesAggregator
- * ValuesAggregator can produce the results for commands list() and values() - */ - public enum ValuesAggregatorMode { - LIST, VALUES - } - - /** Used for EarliestLatestAggregator
- * EarliestLatestAggregator can produce the results for commands earliest(),
- * latest(), earliest_time(), latest_time(), rate() - */ - public enum EarliestLatestAggregatorMode { - EARLIEST, LATEST, EARLIEST_TIME, LATEST_TIME, RATE - } - /** Used for MinMaxAggregator
- * min(), max(), range() - */ - public enum MinMaxAggregatorMode { - MIN, MAX, RANGE - } + /** + * Used for ValuesAggregator
+ * ValuesAggregator can produce the results for commands list() and values() + */ + public enum ValuesAggregatorMode { + LIST, VALUES + } + + /** + * Used for EarliestLatestAggregator
+ * EarliestLatestAggregator can produce the results for commands earliest(),
+ * latest(), earliest_time(), latest_time(), rate() + */ + public enum EarliestLatestAggregatorMode { + EARLIEST, LATEST, EARLIEST_TIME, LATEST_TIME, RATE + } + + /** + * Used for MinMaxAggregator
+ * min(), max(), range() + */ + public enum MinMaxAggregatorMode { + MIN, MAX, RANGE + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/CountBuffer.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/CountBuffer.java index 3f19701..fc33286 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/CountBuffer.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/CountBuffer.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,57 +43,60 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses; import java.io.Serializable; import java.util.Map; /** - * Java Bean compliant class to enclose the map with helper methods - * used in DistinctCountAggregator + * Java Bean compliant class to enclose the map with helper methods used in DistinctCountAggregator + * * @author eemhu - * */ public class CountBuffer extends MapBuffer implements Serializable { - private static final long serialVersionUID = 1L; - - // Helper methods - /** - * Merges the map in the buffer with another. - * @param another map to merge with - */ - public void mergeMap(Map another) { - another.forEach((key, value) -> { - this.map.merge(key, value, (v1, v2) -> { - // This gets called for possible duplicates - // In that case, add the values together - return v1 + v2; - }); - }); - } + private static final long serialVersionUID = 1L; + + // Helper methods + + /** + * Merges the map in the buffer with another. 
+ * + * @param another map to merge with + */ + public void mergeMap(Map another) { + another.forEach((key, value) -> { + this.map + .merge(key, value, (v1, v2) -> { + // This gets called for possible duplicates + // In that case, add the values together + return v1 + v2; + }); + }); + } + + /** + * Adds data to the map + * + * @param data string to add + */ + public void add(String data) { + if (this.map.containsKey(data)) { + Long currentValue = this.map.get(data); + this.map.put(data, currentValue + 1L); + } + else { + this.map.put(data, 1L); + } + } - /** - * Adds data to the map - * @param data string to add - */ - public void add(String data) { - if (this.map.containsKey(data)) { - Long currentValue = this.map.get(data); - this.map.put(data, currentValue + 1L); - } - else { - this.map.put(data, 1L); - } - } + /** + * Returns the distinct count of the items in the buffer + * + * @return distinct count as an integer + */ + public Integer dc() { + return this.map.size(); + } - /** - * Returns the distinct count of the items in the buffer - * @return distinct count as an integer - */ - public Integer dc() { - return this.map.size(); - } - -} \ No newline at end of file +} diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/ListBuffer.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/ListBuffer.java index 53cef55..7123a52 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/ListBuffer.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/ListBuffer.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses; import java.io.Serializable; @@ -52,70 +51,76 @@ /** * An abstract class to be List-based aggregation buffers on + * * @author eemhu - * * @param Type of data in list */ public abstract class ListBuffer implements Serializable { - private static final long serialVersionUID = 1L; - protected List list; - - // Required constructors & methods for Java Bean compliance + private static final long serialVersionUID = 1L; + protected List list; + + // Required constructors & methods for Java Bean compliance - /** - * Initialize a ListBuffer with an arraylist of type T - */ - public ListBuffer() { - this.list = new ArrayList(); - } + /** + * Initialize a ListBuffer with an arraylist of type T + */ + public ListBuffer() { + this.list = new ArrayList(); + } - /** - * Initialize a ListBuffer with an existing list of type T - * @param list existing list to initialize the buffer with - */ - public ListBuffer(List list) { - this.list = list; - } + /** + * Initialize a ListBuffer with an existing list of type T + * + * @param list existing list to initialize the buffer with + */ + public ListBuffer(List list) { + this.list = list; + } - /** - * Gets the internal list from the buffer - * @return list of type T - */ - public List getList() { - return this.list; - } + /** + * Gets the internal list from the buffer + * + * @return list of type T + */ + public List getList() { + return this.list; + } - /** - * Sets the internal list of the buffer - * @param list to set it to - */ - public void setList(List list) { - this.list = list; - } + /** + * Sets the internal list of the buffer + * + * @param list to set it to + */ + public void setList(List list) { + this.list = list; + } - /** - * Gets the size of the internal list - * @return size of the list as an integer - */ - public int getSize() { - return this.list.size(); - } + /** + * Gets the size of the internal list + * + * @return size of the list as an integer + */ + public int getSize() { + return this.list.size(); + } - /** - * Abstract method for sorting the internal list - */ - public abstract void sortInternalList(); + /** + * Abstract method for sorting the internal list + */ + public abstract void sortInternalList(); - /** - * Abstract method for merging the internal list with another - * @param another list to merge with - */ - public abstract void mergeList(List another); + /** + * Abstract method for merging the internal list with another + * + * @param another list to merge with + */ + public abstract void mergeList(List another); - /** - * Abstract method for adding data to the buffer - * @param data to add to the buffer - */ - public abstract void add(T data); -} \ No newline at end of file + /** + * Abstract method for adding data to the buffer + * + * @param data to add to the buffer + */ + public abstract void add(T data); +} diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/MapBuffer.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/MapBuffer.java index 8ce5650..7f05076 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/MapBuffer.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/MapBuffer.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 
2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses; import java.io.Serializable; @@ -52,43 +51,47 @@ /** * An abstract class to base Map-based Aggregation buffers on + * * @author eemhu - * * @param type of key in map * @param type of value in map */ public abstract class MapBuffer implements Serializable { - private static final long serialVersionUID = 1L; - protected Map map; - - // Required constructors & methods for Java Bean compliance - /** - * Initialize a map buffer - */ - public MapBuffer() { - this.map = new HashMap(); - } + private static final long serialVersionUID = 1L; + protected Map map; + + // Required constructors & methods for Java Bean compliance + + /** + * Initialize a map buffer + */ + public MapBuffer() { + this.map = new HashMap(); + } - /** - * Gets the internal map of the buffer - * @return internal map - */ - public Map getMap() { - return this.map; - } + /** + * Gets the internal map of the buffer + * + * @return internal map + */ + public Map getMap() { + return this.map; + } - /** - * Sets the internal map of the buffer - * @param map new internal map - */ - public void setMap(Map map) { - this.map = map; - } + /** + * Sets the internal map of the buffer + * + * @param map new internal map + */ + public void setMap(Map map) { + this.map = map; + } - /** - * Abstract method to merge internal map with another - * @param another map to merge with - */ - public abstract void mergeMap(Map another); + /** + * Abstract method to merge internal map with another + * + * @param another map to merge with + */ + public abstract void mergeMap(Map another); } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/MinMaxBuffer.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/MinMaxBuffer.java index d705f4a..7d104e9 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/MinMaxBuffer.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/MinMaxBuffer.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses; import org.apache.spark.sql.types.DataTypes; @@ -51,10 +50,10 @@ import java.io.Serializable; /** - * The buffer class that is used for the MinMaxAggregator. - * Must be Java Bean compliant and serializable. + * The buffer class that is used for the MinMaxAggregator. Must be Java Bean compliant and serializable. */ public class MinMaxBuffer implements Serializable { + private static final long serialVersionUID = 1L; protected Double minNumber = null; @@ -104,9 +103,9 @@ public MinMaxBuffer() { } /** - * Gets the outputFormatType of the buffer. - * Can be either DataTypes.DoubleType.typeName() or + * Gets the outputFormatType of the buffer. Can be either DataTypes.DoubleType.typeName() or * DataTypes.IntegerType.typeName() + * * @return output format as a string */ public String getOutputFormatType() { @@ -114,9 +113,9 @@ public String getOutputFormatType() { } /** - * Sets the outputFormatType of the buffer. - * Can be set to DataTypes.DoubleType.typeName() only, - * the default is DataTypes.IntegerType.typeName() + * Sets the outputFormatType of the buffer. Can be set to DataTypes.DoubleType.typeName() only, the + * default is DataTypes.IntegerType.typeName() + * * @param outputFormatType type to set to */ public void setOutputFormatType(String outputFormatType) { @@ -126,8 +125,9 @@ public void setOutputFormatType(String outputFormatType) { } /** - * Checks given number against current max and min values, and sets it as them if it is - * the new minimum and/or maximum + * Checks given number against current max and min values, and sets it as them if it is the new minimum and/or + * maximum + * * @param value to add */ public void addNumber(double value) { @@ -142,8 +142,9 @@ public void addNumber(double value) { } /** - * Checks given string against current max and min values, and sets it as them if it is - * the new minimum and/or maximum + * Checks given string against current max and min values, and sets it as them if it is the new minimum and/or + * maximum + * * @param value to add */ public void addString(String value) { diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/ModeBuffer.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/ModeBuffer.java index 7f47ff6..9c11ce4 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/ModeBuffer.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/ModeBuffer.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,64 +43,68 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses; import java.io.Serializable; import java.util.Map; /** - * Buffer used by the - * {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ModeAggregator#ModeAggregator(String) ModeAggregator} + * Buffer used by the {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ModeAggregator#ModeAggregator(String) + * ModeAggregator} */ public class ModeBuffer extends MapBuffer implements Serializable { - private static final long serialVersionUID = 1L; - /** - * Adds a key with currentValue + 1 if it was already in the map, otherwise 1. - * @param key Key to add - */ - public void add(String key) { - if (this.map.containsKey(key)) { - Long currentValue = this.map.get(key); - this.map.put(key, currentValue + 1L); - } - else { - this.map.put(key, 1L); - } - } + private static final long serialVersionUID = 1L; + + /** + * Adds a key with currentValue + 1 if it was already in the map, otherwise 1. + * + * @param key Key to add + */ + public void add(String key) { + if (this.map.containsKey(key)) { + Long currentValue = this.map.get(key); + this.map.put(key, currentValue + 1L); + } + else { + this.map.put(key, 1L); + } + } + + /** + * Merge internal map with another + * + * @param another map to merge with + */ + public void mergeMap(Map another) { + another.forEach((key, value) -> { + this.map + .merge(key, value, (v1, v2) -> { + // This gets called for possible duplicates + // In that case, add them together + return v1 + v2; + }); + }); + } + + /** + * Returns the most frequent entry in the buffer + * + * @return most frequent entry as a string + */ + public String mode() { + Map.Entry mostFrequentEntry = null; + + for (Map.Entry entry : this.map.entrySet()) { + if (mostFrequentEntry == null) { + mostFrequentEntry = entry; + } + else if (mostFrequentEntry != null && entry.getValue() > mostFrequentEntry.getValue()) { + mostFrequentEntry = entry; + } + } - /** - * Merge internal map with another - * @param another map to merge with - */ - public void mergeMap(Map another) { - another.forEach((key, value) -> { - this.map.merge(key, value, (v1, v2) -> { - // This gets called for possible duplicates - // In that case, add them together - return v1 + v2; - }); - }); - } + return mostFrequentEntry.getKey(); + } - /** - * Returns the most frequent entry in the buffer - * @return most frequent entry as a string - */ - public String mode() { - Map.Entry mostFrequentEntry = null; - - for (Map.Entry entry : this.map.entrySet()) { - if (mostFrequentEntry == null) { - mostFrequentEntry = entry; - } - else if (mostFrequentEntry != null && entry.getValue() > mostFrequentEntry.getValue()) { - mostFrequentEntry = entry; - } - } - - return mostFrequentEntry.getKey(); - } - } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/PercentileBuffer.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/PercentileBuffer.java index cced5ae..a0eda94 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/PercentileBuffer.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/PercentileBuffer.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it 
under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses; import java.io.Serializable; @@ -51,88 +50,95 @@ import java.util.List; /** - * Buffer used for {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ExactPercentileAggregator ExactPercentileAggregator} - * + * Buffer used for {@link com.teragrep.pth10.ast.commands.aggregate.UDAFs.ExactPercentileAggregator + * ExactPercentileAggregator} */ public class PercentileBuffer extends ListBuffer implements Serializable { - private static final long serialVersionUID = 1L; - private double percentile = 0.5d; - - /** - * Initialize the buffer with default percentile = 0.5d - */ - public PercentileBuffer() { - super(); - } - - /** - * Initialize the buffer with a custom percentile - * @param percentile values 0.0 - 1.0 (0 - 100%) - */ - public PercentileBuffer(double percentile) { - super(); - this.percentile = percentile; - } - - public double getPercentile() { - return this.percentile; - } - - public void setPercentile(double percentile) { - this.percentile = percentile; - } - - // Helper methods - - /** - * Merge list with another - * @param another list to merge with - */ - public void mergeList(List another) { - this.list.addAll(another); - } - - /** - * Add value to buffer - * @param value to add to the buffer - */ - public void add(Double value) { - this.list.add(value); - } - - /** - * Sort the internal list used by the buffer - */ - public void sortInternalList() { - Collections.sort(this.list); - } - - /** - * Calculates the percentile
- * If the amount of values is even, get the mean of the two middle values, - * otherwise get the middle value. - * @return percentile - */ - public double calculatePercentile() { - double nonRoundedIndex = this.percentile * (this.list.size() - 1); - int size = this.list.size(); - - int index = -1; - if (Math.abs(nonRoundedIndex - Math.ceil(nonRoundedIndex)) < Math.abs(nonRoundedIndex - Math.floor(nonRoundedIndex))) { - // difference from original to rounded up < down - - index = (int)Math.ceil(nonRoundedIndex); - } - else { - index = (int)Math.floor(nonRoundedIndex); - } - - if (size % 2 == 0 && index + 1 <= this.list.size() - 1) { - return (this.list.get(index) + this.list.get(index + 1))/2d; - } - else { - return (double) this.list.get(index); - } - } - + + private static final long serialVersionUID = 1L; + private double percentile = 0.5d; + + /** + * Initialize the buffer with default percentile = 0.5d + */ + public PercentileBuffer() { + super(); + } + + /** + * Initialize the buffer with a custom percentile + * + * @param percentile values 0.0 - 1.0 (0 - 100%) + */ + public PercentileBuffer(double percentile) { + super(); + this.percentile = percentile; + } + + public double getPercentile() { + return this.percentile; + } + + public void setPercentile(double percentile) { + this.percentile = percentile; + } + + // Helper methods + + /** + * Merge list with another + * + * @param another list to merge with + */ + public void mergeList(List another) { + this.list.addAll(another); + } + + /** + * Add value to buffer + * + * @param value to add to the buffer + */ + public void add(Double value) { + this.list.add(value); + } + + /** + * Sort the internal list used by the buffer + */ + public void sortInternalList() { + Collections.sort(this.list); + } + + /** + * Calculates the percentile
+ * If the amount of values is even, get the mean of the two middle values, otherwise get the middle value. + * + * @return percentile + */ + public double calculatePercentile() { + double nonRoundedIndex = this.percentile * (this.list.size() - 1); + int size = this.list.size(); + + int index = -1; + if ( + Math.abs(nonRoundedIndex - Math.ceil(nonRoundedIndex)) < Math + .abs(nonRoundedIndex - Math.floor(nonRoundedIndex)) + ) { + // difference from original to rounded up < down + + index = (int) Math.ceil(nonRoundedIndex); + } + else { + index = (int) Math.floor(nonRoundedIndex); + } + + if (size % 2 == 0 && index + 1 <= this.list.size() - 1) { + return (this.list.get(index) + this.list.get(index + 1)) / 2d; + } + else { + return (double) this.list.get(index); + } + } + } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/SumBuffer.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/SumBuffer.java index a110cef..f2878ad 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/SumBuffer.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/SumBuffer.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -52,6 +52,7 @@ import java.util.concurrent.atomic.AtomicReference; public final class SumBuffer implements Serializable { + private final static long serialVersionUID = 1L; private final AtomicReference doubleSum; private final AtomicLong longSum; diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/TimestampMapBuffer.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/TimestampMapBuffer.java index 4f1638c..b9445f4 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/TimestampMapBuffer.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/TimestampMapBuffer.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses; import java.io.Serializable; @@ -52,159 +51,169 @@ import java.util.Optional; /** - * Java Bean compliant class to enclose the map with helper methods - * used in EarliestLatestAggregator.java + * Java Bean compliant class to enclose the map with helper methods used in EarliestLatestAggregator.java + * * @author eemhu - * */ public class TimestampMapBuffer extends MapBuffer implements Serializable { - private static final long serialVersionUID = 1L; - - /** - * Merge the buffer's map with another - * @param another map to merge with - */ - public void mergeMap(Map another) { - another.forEach((key, value) -> { - this.map.merge(key, value, (v1, v2) -> { - // This gets called for possible duplicates - // In that case, retain the first value - return v1; - }); - }); - } - - /** - * Add Time, Data pair to map - * @param time key - * @param data value - */ - public void add(Timestamp time, String data) { - if (!this.map.containsKey(time)) { - this.map.put(time, data); - } - } - - /** - * Gets the earliest map entry - * @return Map.Entry - */ - public Optional> earliestMapEntry() { - Optional> earliestEntry = Optional.empty(); - - for (Map.Entry entry : this.map.entrySet()) { - if (!earliestEntry.isPresent()) { - earliestEntry = Optional.of(entry); - } - else if (entry.getKey().before(earliestEntry.get().getKey())) { - earliestEntry = Optional.of(entry); - } - } - - return earliestEntry; - } - - /** - * Gets the latest map entry - * @return Map.Entry - */ - public Optional> latestMapEntry() { - Optional> latestEntry = Optional.empty(); - - for (Map.Entry entry : this.map.entrySet()) { - if (!latestEntry.isPresent()) { - latestEntry = Optional.of(entry); - } - else if (entry.getKey().after(latestEntry.get().getKey())) { - latestEntry = Optional.of(entry); - } - } - - return latestEntry; - } - - /** - * Gets the earliest field value - * @return field value as string - */ - public String earliest() { - if (this.earliestMapEntry().isPresent()) { - return this.earliestMapEntry().get().getValue(); - } else { - return ""; - } - } - - /** - * Gets the latest field value - * @return field value as string - */ - public String latest() { - if (this.latestMapEntry().isPresent()) { - return this.latestMapEntry().get().getValue(); - } - else { - return ""; - } - } - - /** - * Gets the earliest unix time - * @return field time as unix epoch - */ - public String earliest_time() { - if (this.earliestMapEntry().isPresent()) { - return String.valueOf(this.earliestMapEntry().get().getKey().getTime() / 1000L); - } else { - return ""; - } - } - - /** - * Gets the latest unix time - * @return field time as unix epoch - */ - public String latest_time() { - if (this.latestMapEntry().isPresent()) { - return String.valueOf(this.latestMapEntry().get().getKey().getTime() / 1000L); - } else { - return ""; - } - } - - /** - * Calculates the rate
- * rate = latest - earliest / latest_time - earliest_time
- * latest and earliest must be numerical
- * latest_time != earliest_time
- * @return rate as double - */ - public Double rate() { - Optional> earliestEntry = this.earliestMapEntry(); - Optional> latestEntry = this.latestMapEntry(); - if (!earliestEntry.isPresent() || !latestEntry.isPresent()) { - throw new IllegalStateException("Could not get earliest / latest entry from data!"); - } - - // get earliest and latest values - must be numerical! - long earliest = Long.parseLong(earliestEntry.get().getValue()); - long latest = Long.parseLong(latestEntry.get().getValue()); - - // get earliest and latest time - long earliest_time = earliestEntry.get().getKey().getTime() / 1000L; - long latest_time = latestEntry.get().getKey().getTime() / 1000L; - - if (earliest_time == latest_time) { - throw new IllegalStateException("Earliest time was the same as the latest time! Can't calculate rate."); - } - - // rate = latest - earliest / latest_time - earliest_time - double dividend = (double)(latest - earliest); - double divisor = (double)(latest_time - earliest_time); - double rate = dividend/divisor; - - return rate; - } + private static final long serialVersionUID = 1L; + + /** + * Merge the buffer's map with another + * + * @param another map to merge with + */ + public void mergeMap(Map another) { + another.forEach((key, value) -> { + this.map + .merge(key, value, (v1, v2) -> { + // This gets called for possible duplicates + // In that case, retain the first value + return v1; + }); + }); + } + + /** + * Add Time, Data pair to map + * + * @param time key + * @param data value + */ + public void add(Timestamp time, String data) { + if (!this.map.containsKey(time)) { + this.map.put(time, data); + } + } + + /** + * Gets the earliest map entry + * + * @return Map.Entry + */ + public Optional> earliestMapEntry() { + Optional> earliestEntry = Optional.empty(); + + for (Map.Entry entry : this.map.entrySet()) { + if (!earliestEntry.isPresent()) { + earliestEntry = Optional.of(entry); + } + else if (entry.getKey().before(earliestEntry.get().getKey())) { + earliestEntry = Optional.of(entry); + } + } + + return earliestEntry; + } + + /** + * Gets the latest map entry + * + * @return Map.Entry + */ + public Optional> latestMapEntry() { + Optional> latestEntry = Optional.empty(); + + for (Map.Entry entry : this.map.entrySet()) { + if (!latestEntry.isPresent()) { + latestEntry = Optional.of(entry); + } + else if (entry.getKey().after(latestEntry.get().getKey())) { + latestEntry = Optional.of(entry); + } + } + + return latestEntry; + } + + /** + * Gets the earliest field value + * + * @return field value as string + */ + public String earliest() { + if (this.earliestMapEntry().isPresent()) { + return this.earliestMapEntry().get().getValue(); + } + else { + return ""; + } + } + + /** + * Gets the latest field value + * + * @return field value as string + */ + public String latest() { + if (this.latestMapEntry().isPresent()) { + return this.latestMapEntry().get().getValue(); + } + else { + return ""; + } + } + + /** + * Gets the earliest unix time + * + * @return field time as unix epoch + */ + public String earliest_time() { + if (this.earliestMapEntry().isPresent()) { + return String.valueOf(this.earliestMapEntry().get().getKey().getTime() / 1000L); + } + else { + return ""; + } + } + + /** + * Gets the latest unix time + * + * @return field time as unix epoch + */ + public String latest_time() { + if (this.latestMapEntry().isPresent()) { + return String.valueOf(this.latestMapEntry().get().getKey().getTime() / 1000L); + } + else { + return ""; + } + } + + /** + * Calculates the rate
+ * rate = latest - earliest / latest_time - earliest_time
+ * latest and earliest must be numerical
+ * latest_time != earliest_time
+ * + * @return rate as double + */ + public Double rate() { + Optional> earliestEntry = this.earliestMapEntry(); + Optional> latestEntry = this.latestMapEntry(); + if (!earliestEntry.isPresent() || !latestEntry.isPresent()) { + throw new IllegalStateException("Could not get earliest / latest entry from data!"); + } + + // get earliest and latest values - must be numerical! + long earliest = Long.parseLong(earliestEntry.get().getValue()); + long latest = Long.parseLong(latestEntry.get().getValue()); + + // get earliest and latest time + long earliest_time = earliestEntry.get().getKey().getTime() / 1000L; + long latest_time = latestEntry.get().getKey().getTime() / 1000L; + + if (earliest_time == latest_time) { + throw new IllegalStateException("Earliest time was the same as the latest time! Can't calculate rate."); + } + + // rate = latest - earliest / latest_time - earliest_time + double dividend = (double) (latest - earliest); + double divisor = (double) (latest_time - earliest_time); + double rate = dividend / divisor; + + return rate; + } } - \ No newline at end of file diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/ValuesBuffer.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/ValuesBuffer.java index 7c79363..79a8085 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/ValuesBuffer.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/BufferClasses/ValuesBuffer.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
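For reference, the rate() method reformatted above computes (latest - earliest) / (latest_time - earliest_time) over epoch seconds; the Javadoc's unparenthesised formula is shorthand for that. A minimal standalone sketch of TimestampMapBuffer usage, assuming the parent MapBuffer initializes an empty map (the buffer class and its methods are taken from the diff above; the driver class itself is illustrative):

    import com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses.TimestampMapBuffer;
    import java.sql.Timestamp;

    public class RateSketch {
        public static void main(String[] args) {
            TimestampMapBuffer buffer = new TimestampMapBuffer();
            // Timestamp takes epoch milliseconds; rate() works on epoch seconds internally
            buffer.add(new Timestamp(1000L * 1000L), "100"); // value 100 at t = 1000 s
            buffer.add(new Timestamp(1100L * 1000L), "400"); // value 400 at t = 1100 s

            System.out.println(buffer.earliest());    // "100"
            System.out.println(buffer.latest_time()); // "1100"
            // rate = (400 - 100) / (1100 - 1000) = 3.0
            System.out.println(buffer.rate());
        }
    }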
*/ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses; import java.io.Serializable; @@ -51,54 +50,57 @@ import java.util.stream.Collectors; /** - * Java Bean compliant class to enclose the array with helper methods - * Used as a buffer for ValuesAggregator + * Java Bean compliant class to enclose the array with helper methods Used as a buffer for ValuesAggregator + * * @author eemhu - * */ public class ValuesBuffer extends ListBuffer implements Serializable { - private static final long serialVersionUID = 1L; - - // Helper methods - /** - * Sort the internal list using lexicographical sorting and remove duplicates - */ - public void sortInternalList() { - this.list = this.list.stream().distinct().sorted(String::compareTo).collect(Collectors.toList()); - } + private static final long serialVersionUID = 1L; + + // Helper methods + + /** + * Sort the internal list using lexicographical sorting and remove duplicates + */ + public void sortInternalList() { + this.list = this.list.stream().distinct().sorted(String::compareTo).collect(Collectors.toList()); + } + + /** + * Merge list with another + * + * @param another list to merge with + */ + public void mergeList(List another) { + this.list.addAll(another); + } + + /** + * Add string data + * + * @param data to add to the buffer + */ + public void add(String data) { + this.list.add(data); + } - /** - * Merge list with another - * @param another list to merge with - */ - public void mergeList(List another) { - this.list.addAll(another); - } + /** + * Form the final result + * + * @return final result as string, separated by a new line
\n
+ */ + public String toString() { + String rv = ""; + int len = this.getSize(); - /** - * Add string data - * @param data to add to the buffer - */ - public void add(String data) { - this.list.add(data); - } + for (int i = 0; i < len; ++i) { + rv = rv.concat(this.list.get(i)); + if (i != len - 1) { + rv = rv.concat("\n"); + } + } - /** - * Form the final result - * @return final result as string, separated by a new line
\n
- */ - public String toString() { - String rv = ""; - int len = this.getSize(); - - for (int i = 0; i < len; ++i) { - rv = rv.concat(this.list.get(i)); - if (i != len - 1) { - rv = rv.concat("\n"); - } - } - - return rv; - } -} \ No newline at end of file + return rv; + } +} diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/CountAggregator.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/CountAggregator.java index 08a17e7..f6a8117 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/CountAggregator.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/CountAggregator.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs; import com.teragrep.pth10.ast.NullValue; @@ -56,10 +55,12 @@ public class CountAggregator extends Aggregator { private final String colName; private final NullValue nullValue; + public CountAggregator(String colName, NullValue nullValue) { this.colName = colName; this.nullValue = nullValue; } + @Override public Long zero() { return 0L; diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/DistinctCountAggregator.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/DistinctCountAggregator.java index da82641..9b71e0c 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/DistinctCountAggregator.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/DistinctCountAggregator.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs; import com.teragrep.pth10.ast.NullValue; @@ -56,66 +55,66 @@ import java.io.Serializable; /** - * Aggregator for command dc() - * Aggregator types: IN=Row, BUF=CountBuffer, OUT=String - * Serializable + * Aggregator for command dc() Aggregator types: IN=Row, BUF=CountBuffer, OUT=String Serializable + * * @author eemhu - * */ public class DistinctCountAggregator extends Aggregator implements Serializable { + private static final long serialVersionUID = 1L; - private final String colName; - private final NullValue nullValue; - - /** - * Constructor used to feed in the column name - * @param colName Column name for source field - * */ - public DistinctCountAggregator(String colName, NullValue nullValue) { - super(); - this.colName = colName; - this.nullValue = nullValue; - } - - /** Encoder for the buffer (class: Values)*/ - @Override - public Encoder bufferEncoder() { - // TODO using kryo should speed this up - return Encoders.javaSerialization(CountBuffer.class); - } + private final String colName; + private final NullValue nullValue; + + /** + * Constructor used to feed in the column name + * + * @param colName Column name for source field + */ + public DistinctCountAggregator(String colName, NullValue nullValue) { + super(); + this.colName = colName; + this.nullValue = nullValue; + } + + /** Encoder for the buffer (class: Values) */ + @Override + public Encoder bufferEncoder() { + // TODO using kryo should speed this up + return Encoders.javaSerialization(CountBuffer.class); + } - /** Encoder for the output (String of all the values in column, lexicographically sorted)*/ - @Override - public Encoder outputEncoder() { - return Encoders.INT(); - } + /** Encoder for the output (String of all the values in column, lexicographically sorted) */ + @Override + public Encoder outputEncoder() { + return Encoders.INT(); + } - /** Initialization */ - @Override - public CountBuffer zero() { - return new CountBuffer(); - } + /** Initialization */ + @Override + public CountBuffer zero() { + return new CountBuffer(); + } - /** Perform at the end of the aggregation */ - @Override - public Integer finish(CountBuffer buffer) { - return buffer.dc(); - } + /** Perform at the end of the aggregation */ + @Override + public Integer finish(CountBuffer buffer) { + return buffer.dc(); + } - /** Merge two buffers into one */ - @Override - public CountBuffer merge(CountBuffer buffer, CountBuffer buffer2) { - buffer.mergeMap(buffer2.getMap()); - return buffer; - } + /** Merge two buffers into one */ + @Override + public CountBuffer merge(CountBuffer buffer, CountBuffer buffer2) { + buffer.mergeMap(buffer2.getMap()); + return buffer; + } - /** Update array with new input value */ - @Override - public CountBuffer reduce(CountBuffer buffer, Row input) { - Object inputObject = input.getAs(colName); - if (inputObject != nullValue.value()) { - buffer.add(inputObject.toString()); - } - return buffer; - } + /** Update array with new input value */ + @Override + public CountBuffer reduce(CountBuffer buffer, Row input) { + Object inputObject = input.getAs(colName); + if (inputObject != nullValue.value()) { + buffer.add(inputObject.toString()); + } + return buffer; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/EarliestLatestAggregator.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/EarliestLatestAggregator.java index 8b98fe3..7308b6a 100644 --- 
a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/EarliestLatestAggregator.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/EarliestLatestAggregator.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs; import com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses.TimestampMapBuffer; @@ -63,90 +62,96 @@ import java.time.format.DateTimeFormatter; /** - * Aggregator for commands earliest() and latest() - * - * Aggregator types: IN=Row, BUF=TimestampMapBuffer, OUT=String + * Aggregator for commands earliest() and latest() Aggregator types: IN=Row, BUF=TimestampMapBuffer, OUT=String * Serializable + * * @author eemhu - * */ -public abstract class EarliestLatestAggregator extends Aggregator implements Serializable { - private static final Logger LOGGER = LoggerFactory.getLogger(EarliestLatestAggregator.class); - - private static final long serialVersionUID = 1L; - private String colName = null; - private static final boolean debugEnabled = false; - - /** Constructor used to feed in the column name - * @param colName column name for source field - * */ - public EarliestLatestAggregator(String colName) { - super(); - this.colName = colName; - } - - /** Encoder for the buffer (class: TimestampMapBuffer) */ - @Override - public Encoder bufferEncoder() { - if (debugEnabled) LOGGER.info("Buffer encoder"); - - // TODO using kryo should speed this up - return Encoders.javaSerialization(TimestampMapBuffer.class); - } - - /** Abstract implementation for output encoder */ - @Override - public abstract Encoder outputEncoder(); - - /** Initialization */ - @Override - public TimestampMapBuffer zero() { - if (debugEnabled) LOGGER.info("zero"); - - return new TimestampMapBuffer(); - } - - /** Perform at the end of the aggregation */ - @Override - public abstract OUT finish(TimestampMapBuffer buffer); - - // Merge two buffers into one - @Override - public TimestampMapBuffer merge(TimestampMapBuffer buffer, TimestampMapBuffer buffer2) { - if (debugEnabled) LOGGER.info("merge"); - - buffer.mergeMap(buffer2.getMap()); - return buffer; - } - - /** Gets the timestamp column as a timestamp, even if it is a string instead of the proper TimestampType*/ - private Timestamp getColumnAsTimestamp(Row input) { - Timestamp rv = null; - try { - rv = (Timestamp) input.getAs("_time"); - } - catch (ClassCastException cce) { - // This should really never be needed, but it seems like the test reads timestamp in as a stringtype - // rather than a timestamp - String temp = input.getAs("_time").toString(); - - DateTimeFormatter formatter = DateTimeFormatter.ISO_ZONED_DATE_TIME; - ZonedDateTime zonedDateTime = 
LocalDateTime.from(formatter.parse(temp)).atZone(ZoneId.of("UTC")); - rv = Timestamp.from(Instant.ofEpochSecond(zonedDateTime.toEpochSecond())); - } - - return rv; - } - - /** Update TimestampMapBuffer with new value */ - @Override - public TimestampMapBuffer reduce(TimestampMapBuffer buffer, Row input) { - if (debugEnabled) LOGGER.info("reduce"); - - Timestamp time = getColumnAsTimestamp(input); - String val = input.getAs(colName).toString(); - buffer.add(time, val); - - return buffer; - } +public abstract class EarliestLatestAggregator extends Aggregator + implements Serializable { + + private static final Logger LOGGER = LoggerFactory.getLogger(EarliestLatestAggregator.class); + + private static final long serialVersionUID = 1L; + private String colName = null; + private static final boolean debugEnabled = false; + + /** + * Constructor used to feed in the column name + * + * @param colName column name for source field + */ + public EarliestLatestAggregator(String colName) { + super(); + this.colName = colName; + } + + /** Encoder for the buffer (class: TimestampMapBuffer) */ + @Override + public Encoder bufferEncoder() { + if (debugEnabled) + LOGGER.info("Buffer encoder"); + + // TODO using kryo should speed this up + return Encoders.javaSerialization(TimestampMapBuffer.class); + } + + /** Abstract implementation for output encoder */ + @Override + public abstract Encoder outputEncoder(); + + /** Initialization */ + @Override + public TimestampMapBuffer zero() { + if (debugEnabled) + LOGGER.info("zero"); + + return new TimestampMapBuffer(); + } + + /** Perform at the end of the aggregation */ + @Override + public abstract OUT finish(TimestampMapBuffer buffer); + + // Merge two buffers into one + @Override + public TimestampMapBuffer merge(TimestampMapBuffer buffer, TimestampMapBuffer buffer2) { + if (debugEnabled) + LOGGER.info("merge"); + + buffer.mergeMap(buffer2.getMap()); + return buffer; + } + + /** Gets the timestamp column as a timestamp, even if it is a string instead of the proper TimestampType */ + private Timestamp getColumnAsTimestamp(Row input) { + Timestamp rv = null; + try { + rv = (Timestamp) input.getAs("_time"); + } + catch (ClassCastException cce) { + // This should really never be needed, but it seems like the test reads timestamp in as a stringtype + // rather than a timestamp + String temp = input.getAs("_time").toString(); + + DateTimeFormatter formatter = DateTimeFormatter.ISO_ZONED_DATE_TIME; + ZonedDateTime zonedDateTime = LocalDateTime.from(formatter.parse(temp)).atZone(ZoneId.of("UTC")); + rv = Timestamp.from(Instant.ofEpochSecond(zonedDateTime.toEpochSecond())); + } + + return rv; + } + + /** Update TimestampMapBuffer with new value */ + @Override + public TimestampMapBuffer reduce(TimestampMapBuffer buffer, Row input) { + if (debugEnabled) + LOGGER.info("reduce"); + + Timestamp time = getColumnAsTimestamp(input); + String val = input.getAs(colName).toString(); + buffer.add(time, val); + + return buffer; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/EarliestLatestAggregator_Double.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/EarliestLatestAggregator_Double.java index 8f0fa64..63c982e 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/EarliestLatestAggregator_Double.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/EarliestLatestAggregator_Double.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 
2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs; import com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses.TimestampMapBuffer; @@ -56,43 +55,46 @@ * Used for rate() function */ public class EarliestLatestAggregator_Double extends EarliestLatestAggregator implements Serializable { - - private AggregatorMode.EarliestLatestAggregatorMode mode = AggregatorMode.EarliestLatestAggregatorMode.RATE; // 0=earliest, 1=latest, 2=earliest_time, 3=latest_time, 4=rate - /** - * Initialize with column and mode - * @param colName column name - * @param mode aggregator mode - */ - public EarliestLatestAggregator_Double(java.lang.String colName, AggregatorMode.EarliestLatestAggregatorMode mode) { - super(colName); - this.mode = mode; - } + private AggregatorMode.EarliestLatestAggregatorMode mode = AggregatorMode.EarliestLatestAggregatorMode.RATE; // 0=earliest, 1=latest, 2=earliest_time, 3=latest_time, 4=rate + + /** + * Initialize with column and mode + * + * @param colName column name + * @param mode aggregator mode + */ + public EarliestLatestAggregator_Double(java.lang.String colName, AggregatorMode.EarliestLatestAggregatorMode mode) { + super(colName); + this.mode = mode; + } + + private static final long serialVersionUID = 1L; - private static final long serialVersionUID = 1L; + /** + * Output encoder + * + * @return double encoder + */ + @Override + public Encoder outputEncoder() { + return Encoders.DOUBLE(); + } - /** - * Output encoder - * @return double encoder - */ - @Override - public Encoder outputEncoder() { - return Encoders.DOUBLE(); - } + /** + * Return the rate + * + * @param buffer buffer + * @return rate as double + */ + @Override + public Double finish(TimestampMapBuffer buffer) { + switch (this.mode) { + case RATE: // rate + return buffer.rate(); + default: // shouldn't happen, throw Exception + throw new UnsupportedOperationException("EarliestLatestAggregator was called with unsupported mode"); + } + } - /** - * Return the rate - * @param buffer buffer - * @return rate as double - */ - @Override - public Double finish(TimestampMapBuffer buffer) { - switch (this.mode) { - case RATE: // rate - return buffer.rate(); - default: // shouldn't happen, throw Exception - throw new UnsupportedOperationException("EarliestLatestAggregator was called with unsupported mode"); - } - } - } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/EarliestLatestAggregator_String.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/EarliestLatestAggregator_String.java index 13e09ec..f4a917a 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/EarliestLatestAggregator_String.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/EarliestLatestAggregator_String.java @@ -1,6 +1,6 @@ /* - * 
Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs; import com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses.TimestampMapBuffer; @@ -54,53 +53,56 @@ /** * Used for earliest, latest, earliest_time, latest_time, rate - */ + */ public class EarliestLatestAggregator_String extends EarliestLatestAggregator implements Serializable { - - private AggregatorMode.EarliestLatestAggregatorMode mode = AggregatorMode.EarliestLatestAggregatorMode.EARLIEST; // 0=earliest, 1=latest, 2=earliest_time, 3=latest_time, 4=rate - /** - * Initialize with column name and mode - * @param colName column name - * @param mode aggregator mode - */ - public EarliestLatestAggregator_String(java.lang.String colName, AggregatorMode.EarliestLatestAggregatorMode mode) { - super(colName); - this.mode = mode; - } + private AggregatorMode.EarliestLatestAggregatorMode mode = AggregatorMode.EarliestLatestAggregatorMode.EARLIEST; // 0=earliest, 1=latest, 2=earliest_time, 3=latest_time, 4=rate + + /** + * Initialize with column name and mode + * + * @param colName column name + * @param mode aggregator mode + */ + public EarliestLatestAggregator_String(java.lang.String colName, AggregatorMode.EarliestLatestAggregatorMode mode) { + super(colName); + this.mode = mode; + } + + private static final long serialVersionUID = 1L; - private static final long serialVersionUID = 1L; + /** + * Gets the output encoder + * + * @return string output encoder + */ + @Override + public Encoder outputEncoder() { + return Encoders.STRING(); + } - /** - * Gets the output encoder - * @return string output encoder - */ - @Override - public Encoder outputEncoder() { - return Encoders.STRING(); - } + /** + * Performs the actual aggregation based on the mode + * + * @param buffer buffer + * @return result as a string + */ + @Override + public String finish(TimestampMapBuffer buffer) { + switch (this.mode) { + case EARLIEST: // earliest + return buffer.earliest(); + case LATEST: // latest + return buffer.latest(); + case EARLIEST_TIME: // earliest_time + return buffer.earliest_time().toString(); + case LATEST_TIME: // latest_time + return buffer.latest_time().toString(); + case RATE: // rate + return buffer.rate().toString(); + default: // shouldn't happen, throw Exception + throw new UnsupportedOperationException("EarliestLatestAggregator was called with unsupported mode"); + } + } - /** - * Performs the actual aggregation based on the mode - * @param buffer buffer - * @return result as a string - */ - @Override - public String finish(TimestampMapBuffer buffer) { - switch (this.mode) { - case EARLIEST: // earliest - return buffer.earliest(); - case LATEST: // latest - return buffer.latest(); - case EARLIEST_TIME: // earliest_time - 
return buffer.earliest_time().toString(); - case LATEST_TIME: // latest_time - return buffer.latest_time().toString(); - case RATE: // rate - return buffer.rate().toString(); - default: // shouldn't happen, throw Exception - throw new UnsupportedOperationException("EarliestLatestAggregator was called with unsupported mode"); - } - } - } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/ExactPercentileAggregator.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/ExactPercentileAggregator.java index 86a39a6..e728df2 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/ExactPercentileAggregator.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/ExactPercentileAggregator.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs; import com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses.PercentileBuffer; @@ -59,100 +58,107 @@ */ public class ExactPercentileAggregator extends Aggregator implements Serializable { - private static final long serialVersionUID = 1L; - private String colName = null; - private double percentile = 0.5d; - - /** - * Calculates the exact percentile using the Nearest Rank algorithm - * @param colName Source column name on dataframe - * @param percentile 0.0d-1.0d percentile to calculate - */ - public ExactPercentileAggregator(String colName, double percentile) { - this.colName = colName; - this.percentile = percentile; - } + private static final long serialVersionUID = 1L; + private String colName = null; + private double percentile = 0.5d; + + /** + * Calculates the exact percentile using the Nearest Rank algorithm + * + * @param colName Source column name on dataframe + * @param percentile 0.0d-1.0d percentile to calculate + */ + public ExactPercentileAggregator(String colName, double percentile) { + this.colName = colName; + this.percentile = percentile; + } + + /** + * Buffer encoder + * + * @return Encoder for PercentileBuffer + */ + @Override + public Encoder bufferEncoder() { + // TODO using kryo should speed this up + return Encoders.javaSerialization(PercentileBuffer.class); + } - /** - * Buffer encoder - * @return Encoder for PercentileBuffer - */ - @Override - public Encoder bufferEncoder() { - // TODO using kryo should speed this up - return Encoders.javaSerialization(PercentileBuffer.class); - } + /** + * sort and calculate percentile + * + * @param buffer PercentileBuffer + * @return percentile as double + */ + @Override + public Double finish(PercentileBuffer buffer) { + buffer.sortInternalList(); + return buffer.calculatePercentile(); + } - /** - * sort and calculate percentile - * @param buffer PercentileBuffer - * @return 
percentile as double - */ - @Override - public Double finish(PercentileBuffer buffer) { - buffer.sortInternalList(); - return buffer.calculatePercentile(); - } + /** + * Merge two PercentileBuffers + * + * @param buffer Original buffer + * @param buffer2 Buffer to merge to original + * @return resulting buffer + */ + @Override + public PercentileBuffer merge(PercentileBuffer buffer, PercentileBuffer buffer2) { + buffer.mergeList(buffer2.getList()); + return buffer; + } - /** - * Merge two PercentileBuffers - * @param buffer Original buffer - * @param buffer2 Buffer to merge to original - * @return resulting buffer - */ - @Override - public PercentileBuffer merge(PercentileBuffer buffer, PercentileBuffer buffer2) { - buffer.mergeList(buffer2.getList()); - return buffer; - } + /** + * Output encoder + * + * @return double encoder + */ + @Override + public Encoder outputEncoder() { + return Encoders.DOUBLE(); + } - /** - * Output encoder - * @return double encoder - */ - @Override - public Encoder outputEncoder() { - return Encoders.DOUBLE(); - } + /** + * Add new data to buffer + * + * @param buffer Buffer + * @param input input row + * @return Buffer with input row added + */ + @Override + public PercentileBuffer reduce(PercentileBuffer buffer, Row input) { + Object inputValue = input.getAs(colName); + Double value = null; - /** - * Add new data to buffer - * @param buffer Buffer - * @param input input row - * @return Buffer with input row added - */ - @Override - public PercentileBuffer reduce(PercentileBuffer buffer, Row input) { - Object inputValue = input.getAs(colName); - Double value = null; + if (inputValue instanceof Long) { + value = ((Long) inputValue).doubleValue(); + } + else if (inputValue instanceof Integer) { + value = ((Integer) inputValue).doubleValue(); + } + else if (inputValue instanceof Float) { + value = ((Float) inputValue).doubleValue(); + } + else if (inputValue instanceof Double) { + value = ((Double) inputValue); + } + else if (inputValue instanceof String) { + value = Double.valueOf((String) inputValue); + } - if (inputValue instanceof Long) { - value = ((Long) inputValue).doubleValue(); - } - else if (inputValue instanceof Integer) { - value = ((Integer) inputValue).doubleValue(); - } - else if (inputValue instanceof Float) { - value = ((Float) inputValue).doubleValue(); - } - else if (inputValue instanceof Double) { - value = ((Double) inputValue); - } - else if (inputValue instanceof String) { - value = Double.valueOf((String)inputValue); - } - - buffer.add(value); - return buffer; - } + buffer.add(value); + return buffer; + } - /** - * Initialize the buffer - * @return initialized buffer - */ - @Override - public PercentileBuffer zero() { - return new PercentileBuffer(this.percentile); - } + /** + * Initialize the buffer + * + * @return initialized buffer + */ + @Override + public PercentileBuffer zero() { + return new PercentileBuffer(this.percentile); + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/FieldIndex/FieldIndex.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/FieldIndex/FieldIndex.java index 239c36d..a7575bb 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/FieldIndex/FieldIndex.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/FieldIndex/FieldIndex.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark 
(pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -48,5 +48,6 @@ import org.apache.spark.sql.Row; public interface FieldIndex { + int fieldIndex(Row row); } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/FieldIndex/FieldIndexImpl.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/FieldIndex/FieldIndexImpl.java index cdbc8be..22320e5 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/FieldIndex/FieldIndexImpl.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/FieldIndex/FieldIndexImpl.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -50,8 +50,10 @@ import java.io.Serializable; public class FieldIndexImpl implements FieldIndex, Serializable { + private static final long serialVersionUID = 1L; private final String field; + public FieldIndexImpl(String field) { this.field = field; } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/FieldIndex/FieldIndexStub.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/FieldIndex/FieldIndexStub.java index b620342..f714912 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/FieldIndex/FieldIndexStub.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/FieldIndex/FieldIndexStub.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
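The ExactPercentileAggregator diffed above delegates the actual calculation to PercentileBuffer.calculatePercentile(): the list is sorted, the requested percentile is mapped to a rounded index, and for an even number of values the two adjacent values around that index are averaged. A minimal driver, assuming the parent ListBuffer initializes an empty list (only the driver class is illustrative):

    import com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses.PercentileBuffer;

    public class PercentileSketch {
        public static void main(String[] args) {
            PercentileBuffer buffer = new PercentileBuffer(0.5d); // median
            buffer.add(1.0);
            buffer.add(4.0);
            buffer.add(2.0);
            buffer.add(3.0);
            buffer.sortInternalList();                        // [1.0, 2.0, 3.0, 4.0]
            System.out.println(buffer.calculatePercentile()); // even count -> (2.0 + 3.0) / 2 = 2.5
        }
    }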
* * * Additional permission under GNU Affero General Public License version 3 @@ -50,7 +50,9 @@ import java.io.Serializable; public class FieldIndexStub implements FieldIndex, Serializable { + private static final long serialVersionUID = 1L; + @Override public int fieldIndex(Row row) { return 0; diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/MinMaxAggregator.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/MinMaxAggregator.java index 8bb7540..4d74ad1 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/MinMaxAggregator.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/MinMaxAggregator.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs; import com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses.MinMaxBuffer; @@ -59,15 +58,17 @@ * Aggregator used for commands min(), max() and range() */ public class MinMaxAggregator extends Aggregator implements Serializable { + private static final long serialVersionUID = 1L; private final String colName; private final AggregatorMode.MinMaxAggregatorMode mode; /** - * Constructor for MinMaxAggregator, where the column name of the target column - * and the aggregator mode must be specified. + * Constructor for MinMaxAggregator, where the column name of the target column and the aggregator mode must be + * specified. + * * @param colName target column name - * @param mode MinMaxAggregator mode + * @param mode MinMaxAggregator mode */ public MinMaxAggregator(String colName, AggregatorMode.MinMaxAggregatorMode mode) { this.colName = colName; @@ -76,6 +77,7 @@ public MinMaxAggregator(String colName, AggregatorMode.MinMaxAggregatorMode mode /** * Format the buffer (or "zero") in the beginning. + * * @return ready-to-go buffer */ @Override @@ -85,8 +87,9 @@ public MinMaxBuffer zero() { /** * Reduce the buffer, aka add more data to it. 
+ * * @param minMaxBuffer Buffer - * @param input Row of input + * @param input Row of input * @return Buffer with input added */ @Override @@ -94,18 +97,18 @@ public MinMaxBuffer reduce(MinMaxBuffer minMaxBuffer, Row input) { Object inputValue = input.getAs(colName); if (inputValue instanceof Long) { - minMaxBuffer.addNumber(((Long)inputValue).doubleValue()); + minMaxBuffer.addNumber(((Long) inputValue).doubleValue()); } else if (inputValue instanceof Double) { - minMaxBuffer.addNumber(((Double)inputValue)); + minMaxBuffer.addNumber(((Double) inputValue)); minMaxBuffer.setOutputFormatType(DataTypes.DoubleType.typeName()); // set to double } else if (inputValue instanceof Float) { - minMaxBuffer.addNumber(((Float)inputValue).doubleValue()); + minMaxBuffer.addNumber(((Float) inputValue).doubleValue()); minMaxBuffer.setOutputFormatType(DataTypes.DoubleType.typeName()); // set to double } else if (inputValue instanceof Integer) { - minMaxBuffer.addNumber(((Integer)inputValue).doubleValue()); + minMaxBuffer.addNumber(((Integer) inputValue).doubleValue()); } // If input is a string... else if (inputValue instanceof String) { @@ -115,13 +118,13 @@ else if (inputValue instanceof String) { } catch (NumberFormatException nfe) { try { // if it fails, try double - Double parsed = Double.valueOf((String)inputValue); + Double parsed = Double.valueOf((String) inputValue); minMaxBuffer.addNumber(parsed); minMaxBuffer.setOutputFormatType(DataTypes.DoubleType.typeName()); // set to double } catch (NumberFormatException nfe2) { // if that fails, add to string list - minMaxBuffer.addString((String)inputValue); + minMaxBuffer.addString((String) inputValue); } } } @@ -134,6 +137,7 @@ else if (inputValue instanceof String) { /** * Merge two buffers into one, from different executors. + * * @param buf1 Buffer #1 * @param buf2 Buffer #2 * @return merged buffer @@ -153,6 +157,7 @@ public MinMaxBuffer merge(MinMaxBuffer buf1, MinMaxBuffer buf2) { /** * Finish the aggregation + * * @param minMaxBuffer Final buffer * @return Result */ @@ -161,28 +166,36 @@ public String finish(MinMaxBuffer minMaxBuffer) { if (this.mode == AggregatorMode.MinMaxAggregatorMode.MAX) { if (minMaxBuffer.getMaxString() != null) { return minMaxBuffer.getMaxString(); - } else if (minMaxBuffer.getOutputFormatType().equals(DataTypes.DoubleType.typeName())) { + } + else if (minMaxBuffer.getOutputFormatType().equals(DataTypes.DoubleType.typeName())) { return minMaxBuffer.getMaxNumber().toString(); - } else { + } + else { return String.valueOf(minMaxBuffer.getMaxNumber().intValue()); } } else if (this.mode == AggregatorMode.MinMaxAggregatorMode.MIN) { if (minMaxBuffer.getMinString() != null) { return minMaxBuffer.getMinString(); - } else if (minMaxBuffer.getOutputFormatType().equals(DataTypes.DoubleType.typeName())) { + } + else if (minMaxBuffer.getOutputFormatType().equals(DataTypes.DoubleType.typeName())) { return minMaxBuffer.getMinNumber().toString(); - } else { + } + else { return String.valueOf(minMaxBuffer.getMinNumber().intValue()); } } else if (this.mode == AggregatorMode.MinMaxAggregatorMode.RANGE) { if (minMaxBuffer.getMinString() != null || minMaxBuffer.getMaxString() != null) { - throw new RuntimeException("Aggregate function range() requires only numeric values, but strings were found."); - } else if (minMaxBuffer.getOutputFormatType().equals(DataTypes.DoubleType.typeName())) { + throw new RuntimeException( + "Aggregate function range() requires only numeric values, but strings were found." 
+ ); + } + else if (minMaxBuffer.getOutputFormatType().equals(DataTypes.DoubleType.typeName())) { Double range = minMaxBuffer.getMaxNumber() - minMaxBuffer.getMinNumber(); return range.toString(); - } else { + } + else { Double range = minMaxBuffer.getMaxNumber() - minMaxBuffer.getMinNumber(); return String.valueOf(range.intValue()); } @@ -192,6 +205,7 @@ else if (this.mode == AggregatorMode.MinMaxAggregatorMode.RANGE) { /** * Encoder for buffer + * * @return Buffer encoder */ @Override @@ -202,6 +216,7 @@ public Encoder bufferEncoder() { /** * Encoder for output + * * @return Output encoder */ @Override diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/ModeAggregator.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/ModeAggregator.java index 96e5f8e..de63d1a 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/ModeAggregator.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/ModeAggregator.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
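MinMaxAggregator.reduce() above decides whether a string cell behaves as an integer, a floating-point number, or a plain string by attempting increasingly general parses before falling back to addString(). The same fallback extracted as a standalone sketch (the class and method names here are illustrative, not part of the patch):

    public class ParseFallbackSketch {
        static String classify(String inputValue) {
            try {
                // first attempt: whole number
                return "long " + Long.valueOf(inputValue);
            }
            catch (NumberFormatException nfe) {
                try {
                    // second attempt: floating-point number
                    return "double " + Double.valueOf(inputValue);
                }
                catch (NumberFormatException nfe2) {
                    // neither parse worked: treat as plain string
                    return "string " + inputValue;
                }
            }
        }

        public static void main(String[] args) {
            System.out.println(classify("42"));     // long 42
            System.out.println(classify("4.2"));    // double 4.2
            System.out.println(classify("banana")); // string banana
        }
    }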
*/ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs; import com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses.ModeBuffer; @@ -57,80 +56,88 @@ /** * Aggregator used for the command mode() */ -public class ModeAggregator extends Aggregator implements Serializable{ - private static final long serialVersionUID = 1L; - private String colName = null; +public class ModeAggregator extends Aggregator implements Serializable { + + private static final long serialVersionUID = 1L; + private String colName = null; + + /** + * Initialize with the column name + * + * @param colName name of the target column + */ + public ModeAggregator(String colName) { + this.colName = colName; + } - /** - * Initialize with the column name - * @param colName name of the target column - */ - public ModeAggregator(String colName) { - this.colName = colName; - } + /** + * Buffer encoder + * + * @return ModeBuffer encoder + */ + @Override + public Encoder bufferEncoder() { + // TODO kryo should speed this up + return Encoders.javaSerialization(ModeBuffer.class); + } - /** - * Buffer encoder - * @return ModeBuffer encoder - */ - @Override - public Encoder bufferEncoder() { - // TODO kryo should speed this up - return Encoders.javaSerialization(ModeBuffer.class); - } + /** + * Return the result as string + * + * @param buffer ModeBuffer + * @return result as string + */ + @Override + public String finish(ModeBuffer buffer) { + return buffer.mode(); + } - /** - * Return the result as string - * @param buffer ModeBuffer - * @return result as string - */ - @Override - public String finish(ModeBuffer buffer) { - return buffer.mode(); - } + /** + * Merge two buffers into one + * + * @param buffer original + * @param buffer2 another + * @return merged buffer + */ + @Override + public ModeBuffer merge(ModeBuffer buffer, ModeBuffer buffer2) { + buffer.mergeMap(buffer2.getMap()); + return buffer; + } - /** - * Merge two buffers into one - * @param buffer original - * @param buffer2 another - * @return merged buffer - */ - @Override - public ModeBuffer merge(ModeBuffer buffer, ModeBuffer buffer2) { - buffer.mergeMap(buffer2.getMap()); - return buffer; - } + /** + * Output encoder + * + * @return String encoder + */ + @Override + public Encoder outputEncoder() { + return Encoders.STRING(); + } - /** - * Output encoder - * @return String encoder - */ - @Override - public Encoder outputEncoder() { - return Encoders.STRING(); - } + /** + * Add new data to the buffer + * + * @param buffer target buffer + * @param input input row + * @return resulting buffer + */ + @Override + public ModeBuffer reduce(ModeBuffer buffer, Row input) { + String inputValue = input.getAs(colName).toString(); + buffer.add(inputValue); - /** - * Add new data to the buffer - * @param buffer target buffer - * @param input input row - * @return resulting buffer - */ - @Override - public ModeBuffer reduce(ModeBuffer buffer, Row input) { - String inputValue = input.getAs(colName).toString(); - buffer.add(inputValue); - - return buffer; - } + return buffer; + } - /** - * Initialize the ModeBuffer - * @return initialized buffer - */ - @Override - public ModeBuffer zero() { - return new ModeBuffer(); - } + /** + * Initialize the ModeBuffer + * + * @return initialized buffer + */ + @Override + public ModeBuffer zero() { + return new ModeBuffer(); + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/SumAggregator.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/SumAggregator.java index 6e4a37e..eaa9191 
100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/SumAggregator.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/SumAggregator.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs; import com.teragrep.pth10.ast.NullValue; diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/UDAF_DistinctCount.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/UDAF_DistinctCount.java index 8bfd815..6d13269 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/UDAF_DistinctCount.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/UDAF_DistinctCount.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs; import org.apache.spark.sql.Row; @@ -60,148 +59,160 @@ import java.util.Map; /** - * UDAF for estdc_error() - * TODO Remove this, when estdc_error can be used without it + * UDAF for estdc_error() TODO Remove this, when estdc_error can be used without it */ public class UDAF_DistinctCount extends UserDefinedAggregateFunction { - private static final Logger LOGGER = LoggerFactory.getLogger(UDAF_DistinctCount.class); - private static final long serialVersionUID = 1L; - - /** - * Buffer schema using a map type of {@literal } - * @return structType - */ - @Override - public StructType bufferSchema() { - return new StructType(new StructField[] { - DataTypes.createStructField("mapOfValues", DataTypes.createMapType(DataTypes.StringType, DataTypes.LongType), false) - }); - } - - /** - * output datatype - * @return integer type - */ - @Override - public DataType dataType() { - return DataTypes.IntegerType; - } - - /** - * Same input returns the same output every time - * @return boolean true - */ - @Override - public boolean deterministic() { - return true; - } - - /** - * Return the final result - * @param buffer Row buffer - * @return result as an integer - */ - @Override - public Integer evaluate(Row buffer) { - // getJavaMap() returns map with Object,Object K,V pair - java.util.Map map = buffer.getJavaMap(0); - - // the size of the map is the distinct count - return map.size(); - } - - /** - * Init buffers used for processing (see bufferSchema()) - * @param buffer buffer to initialize - */ - @Override - public void initialize(MutableAggregationBuffer buffer) { - // Update first index with value (index, value) - buffer.update(0, new HashMap()); - } - - /** Schema used for input column */ - @Override - public StructType inputSchema() { - return new StructType(new StructField[] { - DataTypes.createStructField("input", DataTypes.StringType, true) - }); - } - - /** - * Merge two buffers - * @param buffer1 original - * @param buffer2 another - */ - @Override - public void merge(MutableAggregationBuffer buffer1, Row buffer2) { - // Buffer and row to be merged - java.util.Map map1 = buffer1.getJavaMap(0); - java.util.Map map2 = buffer2.getJavaMap(0); - - // Result map - java.util.Map map3 = new HashMap<>(map1); - - // Go through each k,v pair on map2; merge with map3 and process duplicates - map2.forEach((key, value) -> { - map3.merge(key, value, (v1, v2) -> { - // This gets called for possible duplicates in map2 and map3. - // In that case, add the values together - return castObjectToLong(v1) + castObjectToLong(v2); - }); - }); - - // Update buffer with result map - buffer1.update(0, map3); - } - - /** - * Add more data to the buffer - * @param buffer buffer - * @param input input data - */ - @Override - public void update(MutableAggregationBuffer buffer, Row input) { - // getJavaMap() returns a Scala Map wrapped in an Java Object, - // which does not support put(). the map must be copied to a new map for put() to work - Map javaWrappedScalaMap = buffer.getJavaMap(0); - Map current = new HashMap<>(javaWrappedScalaMap); - String inputString = input.getString(0); - // current.put(inputString, current.containsKey(inputString) ? 
current.get(inputString) ); - - if (current.containsKey((Object)inputString)) { - Long currentValue = castObjectToLong(current.get(inputString)); - current.put((Object)inputString, (Object)(currentValue + 1L)); - } - else { - current.put((Object)inputString, (Object)1L); - } - - - buffer.update(0, current); - } - - /** getJavaMap() returns as {@literal Map} even though it is more like {@literal Map} - thus we need a helper method to cast Object->Long. For Object->String, Object.toString() can be used */ - private Long castObjectToLong(Object o) { - Long rv = null; - try { - if (o instanceof Long) { - rv = ((Long) o).longValue(); - } - else if (o instanceof Integer) { - rv = ((Integer) o).longValue(); - } - else if (o instanceof String) { - rv = Long.valueOf(((String) o)); - } - } - catch (Exception e) { - LOGGER.error("UDAF_DistinctCount: Error casting Object to Long"); - throw e; - } - - return rv; - } + + private static final Logger LOGGER = LoggerFactory.getLogger(UDAF_DistinctCount.class); + private static final long serialVersionUID = 1L; + + /** + * Buffer schema using a map type of {@literal } + * + * @return structType + */ + @Override + public StructType bufferSchema() { + return new StructType(new StructField[] { + DataTypes + .createStructField( + "mapOfValues", DataTypes.createMapType(DataTypes.StringType, DataTypes.LongType), false + ) + }); + } + + /** + * output datatype + * + * @return integer type + */ + @Override + public DataType dataType() { + return DataTypes.IntegerType; + } + + /** + * Same input returns the same output every time + * + * @return boolean true + */ + @Override + public boolean deterministic() { + return true; + } + + /** + * Return the final result + * + * @param buffer Row buffer + * @return result as an integer + */ + @Override + public Integer evaluate(Row buffer) { + // getJavaMap() returns map with Object,Object K,V pair + java.util.Map map = buffer.getJavaMap(0); + + // the size of the map is the distinct count + return map.size(); + } + + /** + * Init buffers used for processing (see bufferSchema()) + * + * @param buffer buffer to initialize + */ + @Override + public void initialize(MutableAggregationBuffer buffer) { + // Update first index with value (index, value) + buffer.update(0, new HashMap()); + } + + /** Schema used for input column */ + @Override + public StructType inputSchema() { + return new StructType(new StructField[] { + DataTypes.createStructField("input", DataTypes.StringType, true) + }); + } + + /** + * Merge two buffers + * + * @param buffer1 original + * @param buffer2 another + */ + @Override + public void merge(MutableAggregationBuffer buffer1, Row buffer2) { + // Buffer and row to be merged + java.util.Map map1 = buffer1.getJavaMap(0); + java.util.Map map2 = buffer2.getJavaMap(0); + + // Result map + java.util.Map map3 = new HashMap<>(map1); + + // Go through each k,v pair on map2; merge with map3 and process duplicates + map2.forEach((key, value) -> { + map3 + .merge(key, value, (v1, v2) -> { + // This gets called for possible duplicates in map2 and map3. + // In that case, add the values together + return castObjectToLong(v1) + castObjectToLong(v2); + }); + }); + + // Update buffer with result map + buffer1.update(0, map3); + } + + /** + * Add more data to the buffer + * + * @param buffer buffer + * @param input input data + */ + @Override + public void update(MutableAggregationBuffer buffer, Row input) { + // getJavaMap() returns a Scala Map wrapped in an Java Object, + // which does not support put(). 
the map must be copied to a new map for put() to work + Map javaWrappedScalaMap = buffer.getJavaMap(0); + Map current = new HashMap<>(javaWrappedScalaMap); + String inputString = input.getString(0); + // current.put(inputString, current.containsKey(inputString) ? current.get(inputString) ); + + if (current.containsKey((Object) inputString)) { + Long currentValue = castObjectToLong(current.get(inputString)); + current.put((Object) inputString, (Object) (currentValue + 1L)); + } + else { + current.put((Object) inputString, (Object) 1L); + } + + buffer.update(0, current); + } + + /** + * getJavaMap() returns as {@literal Map} even though it is more like {@literal Map} + * thus we need a helper method to cast Object->Long. For Object->String, Object.toString() can be used + */ + private Long castObjectToLong(Object o) { + Long rv = null; + try { + if (o instanceof Long) { + rv = ((Long) o).longValue(); + } + else if (o instanceof Integer) { + rv = ((Integer) o).longValue(); + } + else if (o instanceof String) { + rv = Long.valueOf(((String) o)); + } + } + catch (Exception e) { + LOGGER.error("UDAF_DistinctCount: Error casting Object to Long"); + throw e; + } + + return rv; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/ValuesAggregator.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/ValuesAggregator.java index c279276..599392e 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/ValuesAggregator.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/UDAFs/ValuesAggregator.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
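Note: merge() in UDAF_DistinctCount combines two count maps by summing the values of duplicate keys via Map.merge, and evaluate() then reports the map size as the distinct count. A small self-contained illustration of that merge rule on plain typed HashMaps; the sample keys and counts are invented:

    import java.util.HashMap;
    import java.util.Map;

    public class CountMapMergeSketch {

        public static void main(String[] args) {
            Map<String, Long> map1 = new HashMap<>();
            map1.put("sshd", 3L);
            map1.put("cron", 1L);

            Map<String, Long> map2 = new HashMap<>();
            map2.put("sshd", 2L);
            map2.put("nginx", 4L);

            // Same rule as the aggregator's merge(): copy one map, fold the
            // other in, and sum the counts of duplicate keys.
            Map<String, Long> merged = new HashMap<>(map1);
            map2.forEach((key, value) -> merged.merge(key, value, Long::sum));

            System.out.println(merged);        // {nginx=4, cron=1, sshd=5}
            System.out.println(merged.size()); // 3 -> what evaluate() would report
        }
    }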
*/ - package com.teragrep.pth10.ast.commands.aggregate.UDAFs; import com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses.ValuesBuffer; @@ -58,116 +57,131 @@ import java.util.stream.Collectors; /** - * Aggregator for commands values() and list() + * Aggregator for commands values() and list() Aggregator types: IN=Row, BUF=Values, OUT=String Serializable * - * Aggregator types: IN=Row, BUF=Values, OUT=String - * Serializable * @author eemhu - * */ public class ValuesAggregator extends Aggregator implements Serializable { - private static final Logger LOGGER = LoggerFactory.getLogger(ValuesAggregator.class); - - private static final long serialVersionUID = 1L; - private static final boolean debugEnabled = false; - - private static final int maxAmountOfValues = 100; // for list() - private String colName = null; - - private AggregatorMode.ValuesAggregatorMode mode = AggregatorMode.ValuesAggregatorMode.VALUES; // values() or list() - - /** Constructor used to feed in the column name - * @param colName column name - * @param mode Aggregator mode - * */ - public ValuesAggregator(String colName, AggregatorMode.ValuesAggregatorMode mode) { - super(); - this.colName = colName; - this.mode = mode; - } - - /** Encoder for the buffer (class: ValuesBuffer) - * @return encoder for ValuesBuffer - * */ - @Override - public Encoder bufferEncoder() { - if (debugEnabled) LOGGER.info("Buffer encoder"); - - // TODO using kryo should speed this up - return Encoders.javaSerialization(ValuesBuffer.class); - } - - /** Encoder for the output (String of all the values in column, lexicographically sorted) - * @return encoder for string - * */ - @Override - public Encoder outputEncoder() { - if (debugEnabled) LOGGER.info("Output encoder"); - - return Encoders.STRING(); - } - - /** Initialization - * @return initialized buffer - * */ - @Override - public ValuesBuffer zero() { - if (debugEnabled) LOGGER.info("zero"); - - return new ValuesBuffer(); - } - - /** - * Perform at the end of the aggregation - * @param buffer buffer - * */ - @Override - public String finish(ValuesBuffer buffer) { - if (debugEnabled) LOGGER.info("finish"); - - if (mode == AggregatorMode.ValuesAggregatorMode.VALUES) { - // values() needs to be sorted in lexicographical order - // (default java string-to-string comparison order) - buffer.sortInternalList(); - - } - else if (mode == AggregatorMode.ValuesAggregatorMode.LIST) { - // list() is limited to 100 first values in input order - if (buffer.getSize() > maxAmountOfValues) { - buffer.setList(buffer.getList().stream().limit(maxAmountOfValues).collect(Collectors.toList())); - } - } - - return buffer.toString(); - } - - /** - * Merge two buffers into one - * @param buffer original - * @param buffer2 another - * @return resulting buffer - * */ - @Override - public ValuesBuffer merge(ValuesBuffer buffer, ValuesBuffer buffer2) { - if (debugEnabled) LOGGER.info("merge"); - - buffer.mergeList(buffer2.getList()); - return buffer; - } - - /** - * Update array with new input value - * @param buffer buffer - * @param input input row - * @return resulting buffer - * */ - @Override - public ValuesBuffer reduce(ValuesBuffer buffer, Row input) { - if (debugEnabled) LOGGER.info("reduce"); - - String inputString = input.getAs(colName).toString(); - buffer.add(inputString); - - return buffer; - } + + private static final Logger LOGGER = LoggerFactory.getLogger(ValuesAggregator.class); + + private static final long serialVersionUID = 1L; + private static final boolean debugEnabled = false; + + private 
static final int maxAmountOfValues = 100; // for list() + private String colName = null; + + private AggregatorMode.ValuesAggregatorMode mode = AggregatorMode.ValuesAggregatorMode.VALUES; // values() or list() + + /** + * Constructor used to feed in the column name + * + * @param colName column name + * @param mode Aggregator mode + */ + public ValuesAggregator(String colName, AggregatorMode.ValuesAggregatorMode mode) { + super(); + this.colName = colName; + this.mode = mode; + } + + /** + * Encoder for the buffer (class: ValuesBuffer) + * + * @return encoder for ValuesBuffer + */ + @Override + public Encoder bufferEncoder() { + if (debugEnabled) + LOGGER.info("Buffer encoder"); + + // TODO using kryo should speed this up + return Encoders.javaSerialization(ValuesBuffer.class); + } + + /** + * Encoder for the output (String of all the values in column, lexicographically sorted) + * + * @return encoder for string + */ + @Override + public Encoder outputEncoder() { + if (debugEnabled) + LOGGER.info("Output encoder"); + + return Encoders.STRING(); + } + + /** + * Initialization + * + * @return initialized buffer + */ + @Override + public ValuesBuffer zero() { + if (debugEnabled) + LOGGER.info("zero"); + + return new ValuesBuffer(); + } + + /** + * Perform at the end of the aggregation + * + * @param buffer buffer + */ + @Override + public String finish(ValuesBuffer buffer) { + if (debugEnabled) + LOGGER.info("finish"); + + if (mode == AggregatorMode.ValuesAggregatorMode.VALUES) { + // values() needs to be sorted in lexicographical order + // (default java string-to-string comparison order) + buffer.sortInternalList(); + + } + else if (mode == AggregatorMode.ValuesAggregatorMode.LIST) { + // list() is limited to 100 first values in input order + if (buffer.getSize() > maxAmountOfValues) { + buffer.setList(buffer.getList().stream().limit(maxAmountOfValues).collect(Collectors.toList())); + } + } + + return buffer.toString(); + } + + /** + * Merge two buffers into one + * + * @param buffer original + * @param buffer2 another + * @return resulting buffer + */ + @Override + public ValuesBuffer merge(ValuesBuffer buffer, ValuesBuffer buffer2) { + if (debugEnabled) + LOGGER.info("merge"); + + buffer.mergeList(buffer2.getList()); + return buffer; + } + + /** + * Update array with new input value + * + * @param buffer buffer + * @param input input row + * @return resulting buffer + */ + @Override + public ValuesBuffer reduce(ValuesBuffer buffer, Row input) { + if (debugEnabled) + LOGGER.info("reduce"); + + String inputString = input.getAs(colName).toString(); + buffer.add(inputString); + + return buffer; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/utils/PercentileApprox.java b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/utils/PercentileApprox.java index 6bfb638..e7b0996 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/aggregate/utils/PercentileApprox.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/aggregate/utils/PercentileApprox.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. 
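Note: finish() in ValuesAggregator distinguishes the two modes: values() sorts the collected strings lexicographically, while list() keeps at most the first 100 values in arrival order. A short sketch of those two rules on a plain list, with invented sample data:

    import java.util.Arrays;
    import java.util.List;
    import java.util.stream.Collectors;

    public class ValuesVsListSketch {

        private static final int MAX_AMOUNT_OF_VALUES = 100; // list() cap, as in the aggregator

        public static void main(String[] args) {
            List<String> collected = Arrays.asList("b", "c", "a", "c");

            // values(): lexicographical (default String comparison) order
            List<String> values = collected.stream().sorted().collect(Collectors.toList());

            // list(): input order, truncated to the first 100 entries
            List<String> list = collected.stream().limit(MAX_AMOUNT_OF_VALUES).collect(Collectors.toList());

            System.out.println(values); // [a, b, c, c]
            System.out.println(list);   // [b, c, a, c]
        }
    }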
* * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.aggregate.utils; import org.apache.spark.sql.Column; @@ -56,29 +55,34 @@ */ public class PercentileApprox { - /** - * Constructor for the PercentileApprox class - */ - public PercentileApprox() { } - - /** - * Calculates the approximate percentile - * @param col Column containing the values - * @param percentage Xth percentile (0.0 - 1.0) - * @param accuracy 1.0/accuracy = relative error of the approximation - * @return Column for the aggregate - */ - public Column percentile_approx(Column col, Column percentage, Column accuracy) { - AggregateExpression expr = new ApproximatePercentile(col.expr(), percentage.expr(), accuracy.expr()).toAggregateExpression(); - return new Column(expr); - } - /** - * Calculates the approximate percentile with 10000 accuracy (spark default) - * @param col Column containing the values - * @param percentage Xth percentile (0.0 - 1.0) - * @return Column for the aggregate - */ - public Column percentile_approx(Column col, Column percentage) { - return percentile_approx(col, percentage, functions.lit(ApproximatePercentile.DEFAULT_PERCENTILE_ACCURACY())); - } + /** + * Constructor for the PercentileApprox class + */ + public PercentileApprox() { + } + + /** + * Calculates the approximate percentile + * + * @param col Column containing the values + * @param percentage Xth percentile (0.0 - 1.0) + * @param accuracy 1.0/accuracy = relative error of the approximation + * @return Column for the aggregate + */ + public Column percentile_approx(Column col, Column percentage, Column accuracy) { + AggregateExpression expr = new ApproximatePercentile(col.expr(), percentage.expr(), accuracy.expr()) + .toAggregateExpression(); + return new Column(expr); + } + + /** + * Calculates the approximate percentile with 10000 accuracy (spark default) + * + * @param col Column containing the values + * @param percentage Xth percentile (0.0 - 1.0) + * @return Column for the aggregate + */ + public Column percentile_approx(Column col, Column percentage) { + return percentile_approx(col, percentage, functions.lit(ApproximatePercentile.DEFAULT_PERCENTILE_ACCURACY())); + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/EvalStatement.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/EvalStatement.java index 9771182..ed36840 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/EvalStatement.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/EvalStatement.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
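Note: PercentileApprox wraps Spark's ApproximatePercentile expression in a Column so it can be dropped straight into an aggregation. A hedged usage sketch; the dataset and the "duration" column name are assumptions, not part of the patch:

    import com.teragrep.pth10.ast.commands.aggregate.utils.PercentileApprox;
    import org.apache.spark.sql.Column;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.functions;

    public class PercentileApproxUsageSketch {

        // df is a hypothetical dataset with a numeric "duration" column.
        static Dataset<Row> approxMedianOfDuration(Dataset<Row> df) {
            // 0.5 -> approximate median; uses the default-accuracy overload shown above
            Column median = new PercentileApprox()
                    .percentile_approx(functions.col("duration"), functions.lit(0.5));
            return df.agg(median.alias("approx_median_duration"));
        }
    }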
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -75,10 +74,11 @@ import static com.teragrep.jue_01.GlobToRegEx.regexify; /** - * Base statement for evaluation functions. - * Called from {@link com.teragrep.pth10.ast.commands.transformstatement.EvalTransformation} + * Base statement for evaluation functions. Called from + * {@link com.teragrep.pth10.ast.commands.transformstatement.EvalTransformation} */ public class EvalStatement extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(EvalStatement.class); private final DPLParserCatalystContext catCtx; @@ -88,6 +88,7 @@ public class EvalStatement extends DPLParserBaseVisitor { /** * Initialize evalStatement + * * @param catCtx Catalyst context object */ public EvalStatement(DPLParserCatalystContext catCtx) { @@ -96,6 +97,7 @@ public EvalStatement(DPLParserCatalystContext catCtx) { /** * Main visitor function for evalStatement + * * @param ctx main parse tree * @return node */ @@ -114,10 +116,11 @@ private Node evalStatementEmitCatalyst(DPLParser.EvalStatementContext ctx) { if (ctx.getChildCount() == 1) { // leaf rv = left; - } else if (ctx.getChildCount() == 2) { - throw new RuntimeException( - "Unbalanced evalStatement operation:" + ctx.getText()); - } else if (ctx.getChildCount() == 3) { + } + else if (ctx.getChildCount() == 2) { + throw new RuntimeException("Unbalanced evalStatement operation:" + ctx.getText()); + } + else if (ctx.getChildCount() == 3) { // logical operation xxx AND/OR/XOR xxx TerminalNode operation = (TerminalNode) ctx.getChild(1); Token op = getOperation((TerminalNode) ctx.getChild(1)); @@ -158,21 +161,26 @@ private Node evalStatementEmitCatalyst(DPLParser.EvalStatementContext ctx) { } public Node visitL_evalStatement_subEvalStatement(DPLParser.L_evalStatement_subEvalStatementContext ctx) { - LOGGER.info("VisitSubEvalStatements: children=<{}> text=<{}>",ctx.getChildCount(), ctx.getChild(0).getText()); + LOGGER.info("VisitSubEvalStatements: children=<{}> text=<{}>", ctx.getChildCount(), ctx.getChild(0).getText()); // Consume parenthesis and return actual evalStatement - Node rv =visit(ctx.getChild(0)); - LOGGER.debug("VisitSubEvalStatements children=<{}> rv=<{}> class=<{}>", ctx.getChildCount(), rv, rv.getClass().getName()); + Node rv = visit(ctx.getChild(0)); + LOGGER + .debug( + "VisitSubEvalStatements children=<{}> rv=<{}> class=<{}>", ctx.getChildCount(), rv, + rv.getClass().getName() + ); //return new ColumnNode(functions.expr("true")); return rv; } - @Override public Node visitEvalFunctionStatement(DPLParser.EvalFunctionStatementContext ctx) { - Node rv =visit(ctx.getChild(0)); + + @Override + public Node visitEvalFunctionStatement(DPLParser.EvalFunctionStatementContext ctx) { + Node rv = visit(ctx.getChild(0)); return rv; } /** - * evalCompareStatement : (decimalType|fieldType|stringType) - * (DEQ|EQ|LTE|GTE|LT|GT|NEQ|LIKE|Like) evalStatement ; + * evalCompareStatement : (decimalType|fieldType|stringType) (DEQ|EQ|LTE|GTE|LT|GT|NEQ|LIKE|Like) evalStatement ; **/ public Node visitL_evalStatement_evalCompareStatement(DPLParser.L_evalStatement_evalCompareStatementContext ctx) { Node rv = evalCompareStatementEmitCatalyst(ctx); @@ -181,29 +189,29 @@ public Node 
visitL_evalStatement_evalCompareStatement(DPLParser.L_evalStatement_ } /** - * evalCompareStatement : (decimalType|fieldType|stringType) - * (DEQ|EQ|LTE|GTE|LT|GT|NEQ|LIKE|Like) evalStatement ; + * evalCompareStatement : (decimalType|fieldType|stringType) (DEQ|EQ|LTE|GTE|LT|GT|NEQ|LIKE|Like) evalStatement ; **/ private Node evalCompareStatementEmitCatalyst(DPLParser.L_evalStatement_evalCompareStatementContext ctx) { Node rv = null; Column lCol = null; Column rCol = null; Node left = visit(ctx.getChild(0)); - if(left != null){ - lCol=((ColumnNode)left).getColumn(); + if (left != null) { + lCol = ((ColumnNode) left).getColumn(); } - Node right = visit(ctx.getChild(2)); - if(right != null){ - rCol=((ColumnNode)right).getColumn(); + Node right = visit(ctx.getChild(2)); + if (right != null) { + rCol = ((ColumnNode) right).getColumn(); } // Add operation between columns operation =,<,>,...... - Column col = addOperation(lCol, (TerminalNode)ctx.getChild(1), rCol); + Column col = addOperation(lCol, (TerminalNode) ctx.getChild(1), rCol); rv = new ColumnNode(col); return rv; } /** * Converts a {@link TerminalNode} containing an operation into a {@link Token} + * * @param operation TerminalNode of an operation * @return Token */ @@ -243,23 +251,25 @@ private Token getOperation(TerminalNode operation) { /** * Generates a column based on a source, value and operation - * @param source Left hand side + * + * @param source Left hand side * @param operation Operation - * @param value Right hand side + * @param value Right hand side * @return Final resulting column */ private Column addOperation(Column source, TerminalNode operation, Column value) { Column rv = null; - if (operation.getSymbol().getType() == DPLLexer.EVAL_LANGUAGE_MODE_LIKE) { - SparkSession ss = catCtx.getSparkSession(); - ss.udf().register("LikeComparison", new LikeComparison(), DataTypes.BooleanType); - rv = functions.callUDF("LikeComparison", source, value); - } else { - SparkSession ss = catCtx.getSparkSession(); - ss.udf().register("EvalOperation", new EvalOperation(), DataTypes.BooleanType); - rv = functions.callUDF("EvalOperation", source, functions.lit(operation.getSymbol().getType()), value); - } + if (operation.getSymbol().getType() == DPLLexer.EVAL_LANGUAGE_MODE_LIKE) { + SparkSession ss = catCtx.getSparkSession(); + ss.udf().register("LikeComparison", new LikeComparison(), DataTypes.BooleanType); + rv = functions.callUDF("LikeComparison", source, value); + } + else { + SparkSession ss = catCtx.getSparkSession(); + ss.udf().register("EvalOperation", new EvalOperation(), DataTypes.BooleanType); + rv = functions.callUDF("EvalOperation", source, functions.lit(operation.getSymbol().getType()), value); + } return rv; } @@ -272,17 +282,23 @@ private Node evalLogicStatementEmitCatalyst(DPLParser.L_evalStatement_evalLogicS Node rv = null; Column lCol = null; Column rCol = null; - LOGGER.debug("VisitEvalLogicStatement(Catalyst) incoming: children=<{}> text=<{}>",ctx.getChildCount(), ctx.getText()); - Node l = visit(ctx.getChild(0)); + LOGGER + .debug( + "VisitEvalLogicStatement(Catalyst) incoming: children=<{}> text=<{}>", ctx.getChildCount(), + ctx.getText() + ); + Node l = visit(ctx.getChild(0)); LOGGER.debug("VisitEvalLogicStatement(Catalyst) left: class=<{}>", l.getClass().getName()); if (ctx.getChildCount() == 1) { // leaf rv = l; - } else if (ctx.getChildCount() == 2) { + } + else if (ctx.getChildCount() == 2) { // Should not come here at all Node r = visit(ctx.getChild(1)); rv = r; - } else if (ctx.getChildCount() == 3) { + } + 
else if (ctx.getChildCount() == 3) { // logical operation xxx AND/OR/XOR xxx TerminalNode op = (TerminalNode) ctx.getChild(1); Token oper = null; @@ -297,18 +313,18 @@ private Node evalLogicStatementEmitCatalyst(DPLParser.L_evalStatement_evalLogicS Node r = visit(ctx.getChild(2)); LOGGER.debug("visitEvalLogicStatement(Catalyst) right=<{}>", r.getClass().getName()); - if(l instanceof ColumnNode && r instanceof ColumnNode){ - Column col=null; + if (l instanceof ColumnNode && r instanceof ColumnNode) { + Column col = null; Column lcol = ((ColumnNode) l).getColumn(); Column rcol = ((ColumnNode) r).getColumn(); if (op.getSymbol().getType() == DPLLexer.EVAL_LANGUAGE_MODE_AND) { - col=rcol.and(rcol); + col = rcol.and(rcol); } if (op.getSymbol().getType() == DPLLexer.EVAL_LANGUAGE_MODE_OR) { - col=lcol.or(rcol); + col = lcol.or(rcol); } LOGGER.debug("visitEvalLogicStatement(Catalyst) with oper=<{}>", col.expr().sql()); - rv=new ColumnNode(col); + rv = new ColumnNode(col); } } LOGGER.debug(" EvalLogicStatement(Catalyst) generated=<{}> class=<{}>", rv.toString(), rv.getClass().getName()); @@ -326,8 +342,7 @@ public Node visitFieldType(DPLParser.FieldTypeContext ctx) { } /** - * subEvalStatement : PARENTHESIS_L EvalStatement PARENTHESIS_R - * ; + * subEvalStatement : PARENTHESIS_L EvalStatement PARENTHESIS_R ; */ @Override public Node visitSubEvalStatement(DPLParser.SubEvalStatementContext ctx) { @@ -348,7 +363,7 @@ public Node visitT_eval_evalParameter(DPLParser.T_eval_evalParameterContext ctx) // Step initialization this.evalStep = new EvalStep(); this.evalStep.setLeftSide(field.toString()); // eval a = ... - this.evalStep.setRightSide(((ColumnNode)n).getColumn()); // ... = right side + this.evalStep.setRightSide(((ColumnNode) n).getColumn()); // ... = right side return new StepNode(this.evalStep); } @@ -376,18 +391,18 @@ private Node evalMethodIfEmitCatalyst(DPLParser.EvalMethodIfContext ctx) { ColumnNode ifFalse = (ColumnNode) visit(ctx.evalStatement(2)); // Register and call ifClause UDF - UserDefinedFunction ifClauseUdf = functions.udf(new IfClause(), DataTypes.createArrayType(DataTypes.StringType, true)); + UserDefinedFunction ifClauseUdf = functions + .udf(new IfClause(), DataTypes.createArrayType(DataTypes.StringType, true)); SparkSession ss = SparkSession.builder().getOrCreate(); ss.udf().register("ifClause", ifClauseUdf); - Column res = functions.callUDF("ifClause", - logical.getColumn(), ifTrue.getColumn(), ifFalse.getColumn()); + Column res = functions.callUDF("ifClause", logical.getColumn(), ifTrue.getColumn(), ifFalse.getColumn()); return new ColumnNode(res); } /** - * substring() eval method - * Takes a substring out of the given string based on given indices + * substring() eval method Takes a substring out of the given string based on given indices + * * @param ctx EvalMethodSubstrContext * @return column node containing substr column */ @@ -400,14 +415,15 @@ public Node visitEvalMethodSubstr(DPLParser.EvalMethodSubstrContext ctx) { private Node evalMethodSubstrEmitCatalyst(DPLParser.EvalMethodSubstrContext ctx) { ColumnNode rv; Column exp; - String par1=visit(ctx.getChild(2)).toString(); - String par2=visit(ctx.getChild(4)).toString(); + String par1 = visit(ctx.getChild(2)).toString(); + String par2 = visit(ctx.getChild(4)).toString(); // TODO: In spark >=3.5.0 change to use functions.substring() as it supports not // providing the length argument. 
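Note: the substr() translation below assembles a Spark SQL expression string and hands it to functions.expr(). A minimal sketch of the two forms it produces, using an invented start index and length on the "_raw" field:

    import org.apache.spark.sql.Column;
    import org.apache.spark.sql.functions;

    public class SubstrExprSketch {

        public static void main(String[] args) {
            // Three-argument form: substring(field, start, length)
            Column withLength = functions.expr(String.format("substring(%s, %s, %s)", "_raw", "1", "5"));

            // Two-argument form: substring(field, start) -> rest of the string
            Column toEnd = functions.expr(String.format("substring(%s, %s)", "_raw", "3"));

            System.out.println(withLength.expr().sql());
            System.out.println(toEnd.expr().sql());
        }
    }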
if (ctx.evalStatement().size() > 2) { - String par3=visit(ctx.getChild(6)).toString(); + String par3 = visit(ctx.getChild(6)).toString(); exp = functions.expr(String.format("substring(%s, %s, %s)", par1, par2, par3)); - } else { + } + else { exp = functions.expr(String.format("substring(%s, %s)", par1, par2)); } @@ -416,8 +432,8 @@ private Node evalMethodSubstrEmitCatalyst(DPLParser.EvalMethodSubstrContext ctx) } /** - * true() eval method - * returns TRUE + * true() eval method returns TRUE + * * @param ctx EvalMethodTrueContext * @return column node */ @@ -427,15 +443,15 @@ public Node visitEvalMethodTrue(DPLParser.EvalMethodTrueContext ctx) { return rv; } - + private Node evalMethodTrueEmitCatalyst(DPLParser.EvalMethodTrueContext ctx) { Column col = functions.lit(true); return new ColumnNode(col); } /** - * false() eval method - * returns FALSE + * false() eval method returns FALSE + * * @param ctx EvalMethodFalseContext * @return column node */ @@ -451,30 +467,32 @@ private Node evalMethodFalseEmitCatalyst(DPLParser.EvalMethodFalseContext ctx) { } /** - * null() eval method - * Returns NULL + * null() eval method Returns NULL + * * @param ctx EvalMethodNullContext * @return column node */ - @Override public Node visitEvalMethodNull(DPLParser.EvalMethodNullContext ctx) { + @Override + public Node visitEvalMethodNull(DPLParser.EvalMethodNullContext ctx) { LOGGER.debug("Visit eval method null"); Node rv = evalMethodNullEmitCatalyst(ctx); return rv; } - + private Node evalMethodNullEmitCatalyst(DPLParser.EvalMethodNullContext ctx) { Column col = functions.lit(catCtx.nullValue.value()).cast(DataTypes.StringType); return new ColumnNode(col); } /** - * nullif() eval method - * Returns NULL if x==y, otherwise x + * nullif() eval method Returns NULL if x==y, otherwise x + * * @param ctx EvalMethodNullifContext * @return column node */ - @Override public Node visitEvalMethodNullif(DPLParser.EvalMethodNullifContext ctx) { + @Override + public Node visitEvalMethodNullif(DPLParser.EvalMethodNullifContext ctx) { Node rv = evalMethodNullifEmitCatalyst(ctx); return rv; @@ -488,7 +506,8 @@ private Node evalMethodNullifEmitCatalyst(DPLParser.EvalMethodNullifContext ctx) Column yCol = ((ColumnNode) visit(ctx.getChild(4))).getColumn(); // If x == y, return null - Column col = functions.when(xCol.equalTo(yCol), functions.lit(catCtx.nullValue.value()).cast(DataTypes.StringType)); + Column col = functions + .when(xCol.equalTo(yCol), functions.lit(catCtx.nullValue.value()).cast(DataTypes.StringType)); // otherwise, return x col = col.otherwise(xCol); @@ -497,8 +516,8 @@ private Node evalMethodNullifEmitCatalyst(DPLParser.EvalMethodNullifContext ctx) } /** - * searchmatch(x) eval method - * Returns TRUE if the search string matches the event + * searchmatch(x) eval method Returns TRUE if the search string matches the event + * * @param ctx EvalMethodSearchmatchContext * @return column node */ @@ -516,7 +535,7 @@ private Node evalMethodSearchmatchEmitCatalyst(DPLParser.EvalMethodSearchmatchCo // fields array should contain x=... , y=..., etc. 
String[] fields = searchStr.split(" "); - List columns = new ArrayList<>(); // list of all the Columns used in searchmatch + List columns = new ArrayList<>(); // list of all the Columns used in searchmatch for (String f : fields) { // Split to field and literal based on operator @@ -527,7 +546,8 @@ private Node evalMethodSearchmatchEmitCatalyst(DPLParser.EvalMethodSearchmatchCo String regexifiedString = "(?i)" + regexify(operands[0]); // (?i) to make it case-insensitive columns.add(functions.col("_raw").rlike(regexifiedString)); // field=rlike - } else { + } + else { Column field = functions.col(operands[0].trim()); Column literal = functions.lit(operands[1].trim()); String literalString = operands[1].trim(); @@ -536,13 +556,17 @@ private Node evalMethodSearchmatchEmitCatalyst(DPLParser.EvalMethodSearchmatchCo // Test if field equals/leq/geq/lt/gt to literal if (f.contains("<=")) { columns.add(field.leq(literal)); - } else if (f.contains(">=")) { + } + else if (f.contains(">=")) { columns.add(field.geq(literal)); - } else if (f.contains("<")) { + } + else if (f.contains("<")) { columns.add(field.lt(literal)); - } else if (f.contains("=")) { + } + else if (f.contains("=")) { columns.add(field.rlike(regexifiedString)); - } else if (f.contains(">")) { + } + else if (f.contains(">")) { columns.add(field.gt(literal)); } } @@ -564,10 +588,9 @@ private Node evalMethodSearchmatchEmitCatalyst(DPLParser.EvalMethodSearchmatchCo return new ColumnNode(res); } - /** - * now() eval method - * Returns the current system time + * now() eval method Returns the current system time + * * @param ctx EvalMethodNowContext * @return column node */ @@ -586,8 +609,8 @@ private Node evalMethodNowEmitCatalyst(DPLParser.EvalMethodNowContext ctx) { } /** - * len() eval method - * Returns the length of the field contents + * len() eval method Returns the length of the field contents + * * @param ctx EvalMethodLenContext * @return column node */ @@ -598,22 +621,23 @@ public Node visitEvalMethodLen(DPLParser.EvalMethodLenContext ctx) { public Node evalMethodLenEmitCatalyst(DPLParser.EvalMethodLenContext ctx) { ColumnNode rv; - String inField=visit(ctx.getChild(2)).toString(); + String inField = visit(ctx.getChild(2)).toString(); rv = new ColumnNode(functions.length(new Column(inField))); return rv; } /** - * lower() eval method - * Returns the field contents in all lowercase characters + * lower() eval method Returns the field contents in all lowercase characters + * * @param ctx EvalMethodLowerContext * @return column node */ - @Override public Node visitEvalMethodLower(DPLParser.EvalMethodLowerContext ctx) { + @Override + public Node visitEvalMethodLower(DPLParser.EvalMethodLowerContext ctx) { Node rv = evalMethodLowerEmitCatalyst(ctx); return rv; } - + private Node evalMethodLowerEmitCatalyst(DPLParser.EvalMethodLowerContext ctx) { Node rv = null; @@ -625,16 +649,17 @@ private Node evalMethodLowerEmitCatalyst(DPLParser.EvalMethodLowerContext ctx) { } /** - * upper() eval method - * Returns the field contents in all uppercase characters + * upper() eval method Returns the field contents in all uppercase characters + * * @param ctx EvalMethodUpperContext * @return ColumnNode containing column for upper() eval method */ - @Override public Node visitEvalMethodUpper(DPLParser.EvalMethodUpperContext ctx) { + @Override + public Node visitEvalMethodUpper(DPLParser.EvalMethodUpperContext ctx) { Node rv = evalMethodUpperEmitCatalyst(ctx); return rv; } - + private Node evalMethodUpperEmitCatalyst(DPLParser.EvalMethodUpperContext 
ctx) { Node rv = null; @@ -646,16 +671,18 @@ private Node evalMethodUpperEmitCatalyst(DPLParser.EvalMethodUpperContext ctx) { } /** - * urldecode() eval method - * Returns the given URL decoded, e.g. replaces %20 etc. with appropriate human readable characters + * urldecode() eval method Returns the given URL decoded, e.g. replaces %20 etc. with appropriate human readable + * characters + * * @param ctx EvalMethodUrldecodeContext * @return column node */ - @Override public Node visitEvalMethodUrldecode(DPLParser.EvalMethodUrldecodeContext ctx) { + @Override + public Node visitEvalMethodUrldecode(DPLParser.EvalMethodUrldecodeContext ctx) { Node rv = evalMethodUrldecodeEmitCatalyst(ctx); return rv; } - + private Node evalMethodUrldecodeEmitCatalyst(DPLParser.EvalMethodUrldecodeContext ctx) { Node rv = null; @@ -673,16 +700,18 @@ private Node evalMethodUrldecodeEmitCatalyst(DPLParser.EvalMethodUrldecodeContex } /** - * ltrim() eval method - * Returns the field contents with trimString trimmed from left side if given, otherwise spaces and tabs + * ltrim() eval method Returns the field contents with trimString trimmed from left side if given, otherwise spaces + * and tabs + * * @param ctx EvalMethodLtrimContext * @return ColumnNode containing column for trim() eval method */ - @Override public Node visitEvalMethodLtrim(DPLParser.EvalMethodLtrimContext ctx) { + @Override + public Node visitEvalMethodLtrim(DPLParser.EvalMethodLtrimContext ctx) { Node rv = evalMethodLtrimEmitCatalyst(ctx); return rv; } - + private Node evalMethodLtrimEmitCatalyst(DPLParser.EvalMethodLtrimContext ctx) { Node rv = null; @@ -703,16 +732,17 @@ private Node evalMethodLtrimEmitCatalyst(DPLParser.EvalMethodLtrimContext ctx) { } /** - * replace() eval method - * Returns a string with replaced parts as defined by the regex string + * replace() eval method Returns a string with replaced parts as defined by the regex string + * * @param ctx EvalMethodReplaceContext * @return column node */ - @Override public Node visitEvalMethodReplace(DPLParser.EvalMethodReplaceContext ctx) { + @Override + public Node visitEvalMethodReplace(DPLParser.EvalMethodReplaceContext ctx) { Node rv = evalMethodReplaceEmitCatalyst(ctx); return rv; } - + private Node evalMethodReplaceEmitCatalyst(DPLParser.EvalMethodReplaceContext ctx) { Node rv = null; @@ -720,7 +750,9 @@ private Node evalMethodReplaceEmitCatalyst(DPLParser.EvalMethodReplaceContext ct // replace ( x , y , z ) if (ctx.getChildCount() != 8) { - throw new UnsupportedOperationException("Eval method 'replace' requires three arguments: source string, regex string and substitute string."); + throw new UnsupportedOperationException( + "Eval method 'replace' requires three arguments: source string, regex string and substitute string." 
+ ); } Column srcString = ((ColumnNode) visit(ctx.getChild(2))).getColumn(); @@ -733,18 +765,19 @@ private Node evalMethodReplaceEmitCatalyst(DPLParser.EvalMethodReplaceContext ct return rv; } - /** - * rtrim() eval method - * Returns the field contents with trimString trimmed from right side if given, otherwise spaces and tabs + * rtrim() eval method Returns the field contents with trimString trimmed from right side if given, otherwise spaces + * and tabs + * * @param ctx EvalMethodRtrimContext * @return ColumnNode containing column for rtrim() eval method */ - @Override public Node visitEvalMethodRtrim(DPLParser.EvalMethodRtrimContext ctx) { + @Override + public Node visitEvalMethodRtrim(DPLParser.EvalMethodRtrimContext ctx) { Node rv = evalMethodRtrimEmitCatalyst(ctx); return rv; } - + private Node evalMethodRtrimEmitCatalyst(DPLParser.EvalMethodRtrimContext ctx) { Node rv = null; @@ -765,16 +798,18 @@ private Node evalMethodRtrimEmitCatalyst(DPLParser.EvalMethodRtrimContext ctx) { } /** - * trim() eval method - * Returns the field contents with trimString trimmed from both sides if given, otherwise spaces and tabs + * trim() eval method Returns the field contents with trimString trimmed from both sides if given, otherwise spaces + * and tabs + * * @param ctx EvalMethodTrimContext * @return ColumnNode containing Column for trim() eval method */ - @Override public Node visitEvalMethodTrim(DPLParser.EvalMethodTrimContext ctx) { + @Override + public Node visitEvalMethodTrim(DPLParser.EvalMethodTrimContext ctx) { Node rv = evalMethodTrimEmitCatalyst(ctx); return rv; } - + private Node evalMethodTrimEmitCatalyst(DPLParser.EvalMethodTrimContext ctx) { Node rv = null; @@ -795,8 +830,8 @@ private Node evalMethodTrimEmitCatalyst(DPLParser.EvalMethodTrimContext ctx) { } /** - * split() eval method - * Returns field split with delimiter + * split() eval method Returns field split with delimiter + * * @param ctx EvalMethodSplitContext * @return ColumnNode containing the Column for split() eval method */ @@ -820,8 +855,8 @@ private Node evalMethodSplitEmitCatalyst(DPLParser.EvalMethodSplitContext ctx) { } /** - * relative_time() eval method - * Returns timestamp based on given unix epoch and relative time modifier + * relative_time() eval method Returns timestamp based on given unix epoch and relative time modifier + * * @param ctx EvalMethodRelative_timeContext * @return ColumnNode containing Column for relative_time() eval method */ @@ -849,8 +884,8 @@ private Node evalMethodRelative_timeEmitCatalyst(DPLParser.EvalMethodRelative_ti } /** - * strftime() eval method - * Returns a timestamp based on given unix epoch and format string + * strftime() eval method Returns a timestamp based on given unix epoch and format string + * * @param ctx EvalMethodStrftimeContext * @return ColumnNode containing column for strftime() eval method */ @@ -907,8 +942,8 @@ private Node evalMethodStrftimeEmitCatalyst(DPLParser.EvalMethodStrftimeContext } /** - * strptime() eval method - * Returns an unix epoch based on given timestamp and format string + * strptime() eval method Returns an unix epoch based on given timestamp and format string + * * @param ctx EvalMethodStrptimeContext * @return ColumnNode containing the column for strptime() eval method */ @@ -959,8 +994,8 @@ private Node evalMethodStrptimeEmitCatalyst(DPLParser.EvalMethodStrptimeContext } /** - * pow() eval method - * Returns the field to the power of n + * pow() eval method Returns the field to the power of n + * * @param ctx EvalMethodPowContext * 
@return ColumnNode for pow() eval method */ @@ -968,7 +1003,7 @@ public Node visitEvalMethodPow(DPLParser.EvalMethodPowContext ctx) { Node rv = evalMethodPowEmitCatalyst(ctx); return rv; } - + private Node evalMethodPowEmitCatalyst(DPLParser.EvalMethodPowContext ctx) { Node rv = null; // child 2 and 4 are x and y @@ -982,16 +1017,17 @@ private Node evalMethodPowEmitCatalyst(DPLParser.EvalMethodPowContext ctx) { } /** - * abs() eval method - * Returns absolute value + * abs() eval method Returns absolute value + * * @param ctx EvalMethodAbs * @return ColumnNode for abs() eval method */ - @Override public Node visitEvalMethodAbs(DPLParser.EvalMethodAbsContext ctx) { + @Override + public Node visitEvalMethodAbs(DPLParser.EvalMethodAbsContext ctx) { Node rv = evalMethodAbsEmitCatalyst(ctx); return rv; } - + private Node evalMethodAbsEmitCatalyst(DPLParser.EvalMethodAbsContext ctx) { Node rv = null; @@ -1004,16 +1040,17 @@ private Node evalMethodAbsEmitCatalyst(DPLParser.EvalMethodAbsContext ctx) { } /** - * ceiling() / ceil() eval method - * Returns the value rounded up + * ceiling() / ceil() eval method Returns the value rounded up + * * @param ctx EvalMethodCeiling * @return ColumnNode for ceiling() / ceil() eval method */ - @Override public Node visitEvalMethodCeiling(DPLParser.EvalMethodCeilingContext ctx) { + @Override + public Node visitEvalMethodCeiling(DPLParser.EvalMethodCeilingContext ctx) { Node rv = evalMethodCeilingEmitCatalyst(ctx); return rv; } - + private Node evalMethodCeilingEmitCatalyst(DPLParser.EvalMethodCeilingContext ctx) { Node rv = null; @@ -1026,9 +1063,9 @@ private Node evalMethodCeilingEmitCatalyst(DPLParser.EvalMethodCeilingContext ct } /** - * exact() eval method - * Acts as a passthrough - * More details: {@link #evalMethodExactEmitCatalyst(DPLParser.EvalMethodExactContext)} + * exact() eval method Acts as a passthrough More details: + * {@link #evalMethodExactEmitCatalyst(DPLParser.EvalMethodExactContext)} + * * @param ctx EvalMethodExactContext * @return ColumnNode containg Column for exact() eval method */ @@ -1056,16 +1093,17 @@ private Node evalMethodExactEmitCatalyst(DPLParser.EvalMethodExactContext ctx) { } /** - * exp() eval method - * Returns e^n + * exp() eval method Returns e^n + * * @param ctx EvalMethodExpContext * @return ColumnNode containing column for exp() eval method */ - @Override public Node visitEvalMethodExp(DPLParser.EvalMethodExpContext ctx) { + @Override + public Node visitEvalMethodExp(DPLParser.EvalMethodExpContext ctx) { Node rv = evalMethodExpEmitCatalyst(ctx); return rv; } - + private Node evalMethodExpEmitCatalyst(DPLParser.EvalMethodExpContext ctx) { Node rv = null; @@ -1078,16 +1116,17 @@ private Node evalMethodExpEmitCatalyst(DPLParser.EvalMethodExpContext ctx) { } /** - * floor() eval method - * Rounds down to nearest integer + * floor() eval method Rounds down to nearest integer + * * @param ctx EvalMethodFloor * @return ColumnNode containing column for floor() eval method */ - @Override public Node visitEvalMethodFloor(DPLParser.EvalMethodFloorContext ctx) { + @Override + public Node visitEvalMethodFloor(DPLParser.EvalMethodFloorContext ctx) { Node rv = evalMethodFloorEmitCatalyst(ctx); return rv; } - + private Node evalMethodFloorEmitCatalyst(DPLParser.EvalMethodFloorContext ctx) { Node rv = null; @@ -1100,16 +1139,17 @@ private Node evalMethodFloorEmitCatalyst(DPLParser.EvalMethodFloorContext ctx) { } /** - * ln() eval method - * Returns the natural logarithmic of n + * ln() eval method Returns the natural logarithmic of n 
+ * * @param ctx EvalMethodLnContext * @return ColumnNode containing column for ln() eval method */ - @Override public Node visitEvalMethodLn(DPLParser.EvalMethodLnContext ctx) { + @Override + public Node visitEvalMethodLn(DPLParser.EvalMethodLnContext ctx) { Node rv = evalMethodLnEmitCatalyst(ctx); return rv; } - + private Node evalMethodLnEmitCatalyst(DPLParser.EvalMethodLnContext ctx) { Node rv = null; @@ -1122,16 +1162,17 @@ private Node evalMethodLnEmitCatalyst(DPLParser.EvalMethodLnContext ctx) { } /** - * log() eval method - * Returns the nth logarithmic of given number + * log() eval method Returns the nth logarithmic of given number + * * @param ctx EvalMethodLogContext * @return ColumnNode containing the column for log() eval method */ - @Override public Node visitEvalMethodLog(DPLParser.EvalMethodLogContext ctx) { + @Override + public Node visitEvalMethodLog(DPLParser.EvalMethodLogContext ctx) { Node rv = evalMethodLogEmitCatalyst(ctx); return rv; } - + private Node evalMethodLogEmitCatalyst(DPLParser.EvalMethodLogContext ctx) { Node rv; @@ -1146,10 +1187,12 @@ else if (ctx.evalStatement().size() == 2) { // num, base params numberCol = ((ColumnNode) visit(ctx.getChild(2))).getColumn(); base = Double.parseDouble(ctx.getChild(4).getText()); - } else { - throw new UnsupportedOperationException("Eval method 'log' supports two parameters: Number (required) and base (optional)."); } - + else { + throw new UnsupportedOperationException( + "Eval method 'log' supports two parameters: Number (required) and base (optional)." + ); + } Column res = functions.log(base, numberCol); rv = new ColumnNode(res); @@ -1157,8 +1200,8 @@ else if (ctx.evalStatement().size() == 2) { } /** - * cos() eval method - * Returns the cosine of the field value + * cos() eval method Returns the cosine of the field value + * * @param ctx EvalMethodCosContext * @return ColumnNode containing the column for cos() eval method */ @@ -1180,8 +1223,8 @@ private Node evalMethodCosEmitCatalyst(DPLParser.EvalMethodCosContext ctx) { } /** - * cosh() eval method - * Returns the hyperbolic cosine of the field value + * cosh() eval method Returns the hyperbolic cosine of the field value + * * @param ctx EvalMethodCoshContext * @return ColumnNode containing the column for the cosh() eval method */ @@ -1203,16 +1246,17 @@ private Node evalMethodCoshEmitCatalyst(DPLParser.EvalMethodCoshContext ctx) { } /** - * acos() eval method - * Returns arc cosine of the field value + * acos() eval method Returns arc cosine of the field value + * * @param ctx EvalMethodAcosContext * @return ColumnNode containing the column for acos() eval method */ - @Override public Node visitEvalMethodAcos(DPLParser.EvalMethodAcosContext ctx) { + @Override + public Node visitEvalMethodAcos(DPLParser.EvalMethodAcosContext ctx) { Node rv = evalMethodAcosEmitCatalyst(ctx); return rv; } - + private Node evalMethodAcosEmitCatalyst(DPLParser.EvalMethodAcosContext ctx) { Node rv = null; @@ -1225,8 +1269,8 @@ private Node evalMethodAcosEmitCatalyst(DPLParser.EvalMethodAcosContext ctx) { } /** - * acosh() eval method - * Returns the inverse hyperbolic cosine of the field value + * acosh() eval method Returns the inverse hyperbolic cosine of the field value + * * @param ctx EvalMethodAcoshContext * @return ColumnNode containing the column for acosh() eval method */ @@ -1253,8 +1297,8 @@ private Node evalMethodAcoshEmitCatalyst(DPLParser.EvalMethodAcoshContext ctx) { } /** - * sin() eval method - * Returns the sine of the field value + * sin() eval method Returns the 
sine of the field value + * * @param ctx EvalMethodSinContext * @return ColumnNode containing the Column for sin() eval method */ @@ -1276,8 +1320,8 @@ private Node evalMethodSinEmitCatalyst(DPLParser.EvalMethodSinContext ctx) { } /** - * sinh() eval method - * Returns hyperbolic sine of the field value + * sinh() eval method Returns hyperbolic sine of the field value + * * @param ctx EvalMethodSinhContext * @return ColumnNode containing the Column for the sinh() eval method */ @@ -1299,16 +1343,17 @@ private Node evalMethodSinhEmitCatalyst(DPLParser.EvalMethodSinhContext ctx) { } /** - * asin() eval method - * Returns arc sine of the field value + * asin() eval method Returns arc sine of the field value + * * @param ctx EvalMethodAsinContext * @return ColumnNode containing the Column for the asin() eval method */ - @Override public Node visitEvalMethodAsin(DPLParser.EvalMethodAsinContext ctx) { + @Override + public Node visitEvalMethodAsin(DPLParser.EvalMethodAsinContext ctx) { Node rv = evalMethodAsinEmitCatalyst(ctx); return rv; } - + private Node evalMethodAsinEmitCatalyst(DPLParser.EvalMethodAsinContext ctx) { Node rv = null; @@ -1321,8 +1366,8 @@ private Node evalMethodAsinEmitCatalyst(DPLParser.EvalMethodAsinContext ctx) { } /** - * asinh() eval method - * Returns inverse hyperbolic sine of the field value + * asinh() eval method Returns inverse hyperbolic sine of the field value + * * @param ctx EvalMethodAsinhContext * @return ColumnNode containing the Column for the asinh() eval method */ @@ -1348,8 +1393,8 @@ private Node evalMethodAsinhEmitCatalyst(DPLParser.EvalMethodAsinhContext ctx) { } /** - * tan() eval method - * Returns the tangent of the field value + * tan() eval method Returns the tangent of the field value + * * @param ctx EvalMethodTanContext * @return ColumnNode containing the Column for the tan() eval method */ @@ -1370,10 +1415,9 @@ private Node evalMethodTanEmitCatalyst(DPLParser.EvalMethodTanContext ctx) { return rv; } - /** - * tanh() eval method - * Returns the hyperbolic tangent of the field value + * tanh() eval method Returns the hyperbolic tangent of the field value + * * @param ctx EvalMethodTanhContext * @return ColumnNode containing the Column for the tanh() eval method */ @@ -1395,16 +1439,17 @@ private Node evalMethodTanhEmitCatalyst(DPLParser.EvalMethodTanhContext ctx) { } /** - * atan() eval method - * Returns the arc tangent of the field value + * atan() eval method Returns the arc tangent of the field value + * * @param ctx EvalMethodAtanContext * @return ColumnNode containing the Column for the atan() eval method */ - @Override public Node visitEvalMethodAtan(DPLParser.EvalMethodAtanContext ctx) { + @Override + public Node visitEvalMethodAtan(DPLParser.EvalMethodAtanContext ctx) { Node rv = evalMethodAtanEmitCatalyst(ctx); return rv; } - + private Node evalMethodAtanEmitCatalyst(DPLParser.EvalMethodAtanContext ctx) { Node rv = null; @@ -1417,8 +1462,8 @@ private Node evalMethodAtanEmitCatalyst(DPLParser.EvalMethodAtanContext ctx) { } /** - * atan2() eval method - * Returns the arc tangent of Y,X + * atan2() eval method Returns the arc tangent of Y,X + * * @param ctx EvalMethodAtan2Context * @return ColumnNode containg the Column for the atan2() eval method */ @@ -1441,8 +1486,8 @@ private Node evalMethodAtan2EmitCatalyst(DPLParser.EvalMethodAtan2Context ctx) { } /** - * atanh() eval method - * Returns the inverse hyperbolic tangent of the field value + * atanh() eval method Returns the inverse hyperbolic tangent of the field value + * * @param 
ctx EvalMethodAtanhContext * @return ColumnNode containing the Column for the atanh() eval method */ @@ -1469,8 +1514,9 @@ private Node evalMethodAtanhEmitCatalyst(DPLParser.EvalMethodAtanhContext ctx) { } /** - * avg() eval method - * Returns the average of all numerical parameters given as an integer. Ignores parameters that can't be converted to a number. + * avg() eval method Returns the average of all numerical parameters given as an integer. Ignores parameters that + * can't be converted to a number. + * * @param ctx EvalMethodAvgContext * @return ColumnNode containing the Column for the avg() eval method */ @@ -1484,11 +1530,9 @@ public Node visitEvalMethodAvg(DPLParser.EvalMethodAvgContext ctx) { Column isNumber = number.cast(DataTypes.DoubleType).isNotNull(); // only sum numerical values, because the value would result to a null otherwise - sum = functions.when(isNumber, sum.plus(number.cast(DataTypes.DoubleType))) - .otherwise(sum); + sum = functions.when(isNumber, sum.plus(number.cast(DataTypes.DoubleType))).otherwise(sum); - amountSummed = functions.when(isNumber, amountSummed.plus(1)) - .otherwise(amountSummed); + amountSummed = functions.when(isNumber, amountSummed.plus(1)).otherwise(amountSummed); } Column average = functions.round(sum.divide(amountSummed)).cast(DataTypes.IntegerType); @@ -1497,8 +1541,8 @@ public Node visitEvalMethodAvg(DPLParser.EvalMethodAvgContext ctx) { } /** - * hypot() eval method - * Returns the hypotenuse, when X and Y are the edges forming the 90 degree angle of a triangle + * hypot() eval method Returns the hypotenuse, when X and Y are the edges forming the 90 degree angle of a triangle + * * @param ctx EvalMethodHypotContext * @return ColumnNode containing the Column for the hypot() eval method */ @@ -1521,16 +1565,17 @@ private Node evalMethodHypotEmitCatalyst(DPLParser.EvalMethodHypotContext ctx) { } /** - * pi() eval method - * Returns constant pi to 11 digits of precision + * pi() eval method Returns constant pi to 11 digits of precision + * * @param ctx EvalMethodPiContext * @return ColumnNode containing the Column for the pi() eval method */ - @Override public Node visitEvalMethodPi(DPLParser.EvalMethodPiContext ctx) { + @Override + public Node visitEvalMethodPi(DPLParser.EvalMethodPiContext ctx) { Node rv = evalMethodPiEmitCatalyst(ctx); return rv; } - + private Node evalMethodPiEmitCatalyst(DPLParser.EvalMethodPiContext ctx) { Node rv = null; @@ -1543,8 +1588,8 @@ private Node evalMethodPiEmitCatalyst(DPLParser.EvalMethodPiContext ctx) { } /** - * min() eval method - * Returns the minimum of the given arguments + * min() eval method Returns the minimum of the given arguments + * * @param ctx EvalMethodMinContext * @return ColumnNode containing the Column for the min() eval method */ @@ -1559,7 +1604,7 @@ private Node evalMethodMinEmitCatalyst(DPLParser.EvalMethodMinContext ctx) { // min ( x0 , x1 , x2 , ... 
, xn ) List listOfColumns = new ArrayList<>(); - for (int i = 2; i <= ctx.getChildCount()-2; i = i + 2) { + for (int i = 2; i <= ctx.getChildCount() - 2; i = i + 2) { listOfColumns.add(((ColumnNode) visit(ctx.getChild(i))).getColumn()); } @@ -1578,8 +1623,8 @@ private Node evalMethodMinEmitCatalyst(DPLParser.EvalMethodMinContext ctx) { } /** - * max() eval method - * Returns the maximum of the given arguments + * max() eval method Returns the maximum of the given arguments + * * @param ctx EvalMethodMaxContext * @return ColumnNode containing the Column for the max() eval method */ @@ -1594,7 +1639,7 @@ private Node evalMethodMaxEmitCatalyst(DPLParser.EvalMethodMaxContext ctx) { // max ( x0 , x1 , x2 , ... , xn ) List listOfColumns = new ArrayList<>(); - for (int i = 2; i <= ctx.getChildCount()-2; i = i + 2) { + for (int i = 2; i <= ctx.getChildCount() - 2; i = i + 2) { listOfColumns.add(((ColumnNode) visit(ctx.getChild(i))).getColumn()); } @@ -1613,16 +1658,17 @@ private Node evalMethodMaxEmitCatalyst(DPLParser.EvalMethodMaxContext ctx) { } /** - * random() eval method - * Returns a pseudo-random integer from range 0 to 2^31 - 1 + * random() eval method Returns a pseudo-random integer from range 0 to 2^31 - 1 + * * @param ctx EvalMethodRandomContext * @return ColumnNode containing the Column for the random() eval method */ - @Override public Node visitEvalMethodRandom(DPLParser.EvalMethodRandomContext ctx) { + @Override + public Node visitEvalMethodRandom(DPLParser.EvalMethodRandomContext ctx) { Node rv = evalMethodRandomEmitCatalyst(ctx); return rv; } - + private Node evalMethodRandomEmitCatalyst(DPLParser.EvalMethodRandomContext ctx) { Node rv = null; @@ -1638,16 +1684,17 @@ private Node evalMethodRandomEmitCatalyst(DPLParser.EvalMethodRandomContext ctx) } /** - * sqrt() eval method - * Returns the square root of the field value + * sqrt() eval method Returns the square root of the field value + * * @param ctx EvalMethodSqrtContext * @return ColumnNode containing the Column for the sqrt() eval method */ - @Override public Node visitEvalMethodSqrt(DPLParser.EvalMethodSqrtContext ctx) { + @Override + public Node visitEvalMethodSqrt(DPLParser.EvalMethodSqrtContext ctx) { Node rv = evalMethodSqrtEmitCatalyst(ctx); return rv; } - + private Node evalMethodSqrtEmitCatalyst(DPLParser.EvalMethodSqrtContext ctx) { Node rv = null; @@ -1660,8 +1707,8 @@ private Node evalMethodSqrtEmitCatalyst(DPLParser.EvalMethodSqrtContext ctx) { } /** - * sum() eval method - * Returns the sum of the given numerical values/fields + * sum() eval method Returns the sum of the given numerical values/fields + * * @param ctx EvalmethodSumContext * @return ColumnNode containing the Column for the sum() eval method */ @@ -1673,23 +1720,23 @@ public Node visitEvalMethodSum(DPLParser.EvalMethodSumContext ctx) { Column isDouble = number.cast(DataTypes.DoubleType).isNotNull(); // only sum numerical values, because the value would result to a null otherwise - sum = functions.when(isDouble, sum.plus(number)) - .otherwise(sum); + sum = functions.when(isDouble, sum.plus(number)).otherwise(sum); } return new ColumnNode(sum); } /** - * round() eval method - * Returns x rounded to y decimal places, or integer if y missing + * round() eval method Returns x rounded to y decimal places, or integer if y missing + * * @param ctx EvalMethodRoundContext * @return ColumnNode containing the Column for the round() eval method */ - @Override public Node visitEvalMethodRound(DPLParser.EvalMethodRoundContext ctx) { + @Override + public Node 
visitEvalMethodRound(DPLParser.EvalMethodRoundContext ctx) { Node rv = evalMethodRoundEmitCatalyst(ctx); return rv; } - + private Node evalMethodRoundEmitCatalyst(DPLParser.EvalMethodRoundContext ctx) { Node rv = null; @@ -1708,8 +1755,8 @@ private Node evalMethodRoundEmitCatalyst(DPLParser.EvalMethodRoundContext ctx) { } /** - * sigfig() eval method - * Returns the field value reduced to the significant figures + * sigfig() eval method Returns the field value reduced to the significant figures + * * @param ctx EvalMethodSigfigContext * @return ColumnNode containing the Column for the sigfig() eval method */ @@ -1726,7 +1773,6 @@ private Node evalMethodSigfigEmitCatalyst(DPLParser.EvalMethodSigfigContext ctx) // * / result should have minimum number of significant figures of all of the operands // + - result should have the same amount of sigfigs as the least precise number of all of the operands - // This column contains the result of the calculation Column calculation = ((ColumnNode) visit(ctx.getChild(2))).getColumn(); // This is the original input given by the user @@ -1746,7 +1792,8 @@ private Node evalMethodSigfigEmitCatalyst(DPLParser.EvalMethodSigfigContext ctx) if (!matcher.matches()) { // Not numeric, add to listOfCols as col listOfCols.add(functions.col(operands[i])); - } else { + } + else { // Numeric, add as lit listOfCols.add(functions.lit(operands[i])); } @@ -1771,8 +1818,8 @@ private Node evalMethodSigfigEmitCatalyst(DPLParser.EvalMethodSigfigContext ctx) } /** - * case() eval method - * Alternating conditions and values, returns the first value where condition is true + * case() eval method Alternating conditions and values, returns the first value where condition is true + * * @param ctx EvalMethodCaseContext * @return ColumnNode containing the Column for the case() eval method */ @@ -1780,14 +1827,16 @@ public Node visitEvalMethodCase(DPLParser.EvalMethodCaseContext ctx) { Node rv = evalMethodCaseEmitCatalyst(ctx); return rv; } - + private Node evalMethodCaseEmitCatalyst(DPLParser.EvalMethodCaseContext ctx) { Node rv = null; // case ( x , y , x2 , y2 , x3, y3 , ... ) if (ctx.getChildCount() % 2 != 0) { - throw new UnsupportedOperationException("The amount of arguments given was invalid. Make sure each condition has a matching value given as an argument."); + throw new UnsupportedOperationException( + "The amount of arguments given was invalid. Make sure each condition has a matching value given as an argument." 
+ ); } Column condition = null; @@ -1800,7 +1849,7 @@ private Node evalMethodCaseEmitCatalyst(DPLParser.EvalMethodCaseContext ctx) { continue; condition = ((ColumnNode) visit(ctx.getChild(i))).getColumn(); - value = ((ColumnNode) visit(ctx.getChild(i+2))).getColumn(); + value = ((ColumnNode) visit(ctx.getChild(i + 2))).getColumn(); // Skip to i=i+2, so the value doesn't get read as a condition i = i + 2; @@ -1816,25 +1865,27 @@ private Node evalMethodCaseEmitCatalyst(DPLParser.EvalMethodCaseContext ctx) { return rv; } - /** - * validate() eval method - * Opposite of 'case(x,y)', returns the first y where x=false + * validate() eval method Opposite of 'case(x,y)', returns the first y where x=false + * * @param ctx EvalMethodValidateContext * @return ColumnNode containing the Column for the validate() eval method */ - @Override public Node visitEvalMethodValidate(DPLParser.EvalMethodValidateContext ctx) { + @Override + public Node visitEvalMethodValidate(DPLParser.EvalMethodValidateContext ctx) { Node rv = evalMethodValidateEmitCatalyst(ctx); return rv; } - + private Node evalMethodValidateEmitCatalyst(DPLParser.EvalMethodValidateContext ctx) { Node rv = null; // validate ( x , y , x2 , y2 , x3, y3 , ... ) if (ctx.getChildCount() % 2 != 0) { - throw new UnsupportedOperationException("The amount of arguments given was invalid. Make sure each condition has a matching value given as an argument."); + throw new UnsupportedOperationException( + "The amount of arguments given was invalid. Make sure each condition has a matching value given as an argument." + ); } Column condition = null; @@ -1847,7 +1898,7 @@ private Node evalMethodValidateEmitCatalyst(DPLParser.EvalMethodValidateContext continue; condition = ((ColumnNode) visit(ctx.getChild(i))).getColumn(); - value = ((ColumnNode) visit(ctx.getChild(i+2))).getColumn(); + value = ((ColumnNode) visit(ctx.getChild(i + 2))).getColumn(); // Skip to i=i+2, so the value doesn't get read as a condition i = i + 2; @@ -1863,18 +1914,18 @@ private Node evalMethodValidateEmitCatalyst(DPLParser.EvalMethodValidateContext return rv; } - /** - * cidrmatch() eval method - * x= cidr subnet, y= ip address to match with the subnet x + * cidrmatch() eval method x= cidr subnet, y= ip address to match with the subnet x + * * @param ctx EvalMethodCidrmatchContext * @return ColumnNode containing the Column for the cidrmatch() eval method */ - @Override public Node visitEvalMethodCidrmatch(DPLParser.EvalMethodCidrmatchContext ctx) { + @Override + public Node visitEvalMethodCidrmatch(DPLParser.EvalMethodCidrmatchContext ctx) { Node rv = evalMethodCidrmatchEmitCatalyst(ctx); return rv; } - + private Node evalMethodCidrmatchEmitCatalyst(DPLParser.EvalMethodCidrmatchContext ctx) { Node rv = null; @@ -1894,8 +1945,8 @@ private Node evalMethodCidrmatchEmitCatalyst(DPLParser.EvalMethodCidrmatchContex } /** - * coalesce() eval method - * Returns the first non-null argument + * coalesce() eval method Returns the first non-null argument + * * @param ctx EvalMethodCoalesceContext * @return ColumnNode containing the Column for the coalesce() eval method */ @@ -1903,7 +1954,7 @@ public Node visitEvalMethodCoalesce(DPLParser.EvalMethodCoalesceContext ctx) { Node rv = evalMethodCoalesceEmitCatalyst(ctx); return rv; } - + // coalesce ( x , x2 , x3 , ... 
) private Node evalMethodCoalesceEmitCatalyst(DPLParser.EvalMethodCoalesceContext ctx) { Node rv = null; @@ -1913,7 +1964,7 @@ private Node evalMethodCoalesceEmitCatalyst(DPLParser.EvalMethodCoalesceContext Column res = null; // Skip all the non-interesting bits (commas, parenthesis) with the for loop itself - for (int i = 2; i <= ctx.getChildCount()-2; i = i + 2) { + for (int i = 2; i <= ctx.getChildCount() - 2; i = i + 2) { ColumnNode currentItemNode = ((ColumnNode) visit(ctx.getChild(i))); Column currentItem = currentItemNode.getColumn(); @@ -1927,16 +1978,17 @@ private Node evalMethodCoalesceEmitCatalyst(DPLParser.EvalMethodCoalesceContext } /** - * in() eval method - * Returns if the first column's value is any of the other arguments + * in() eval method Returns if the first column's value is any of the other arguments + * * @param ctx EvalMethodInContext * @return ColumnNode containing the Column for the in() eval method */ - @Override public Node visitEvalMethodIn(DPLParser.EvalMethodInContext ctx) { + @Override + public Node visitEvalMethodIn(DPLParser.EvalMethodInContext ctx) { Node rv = evalMethodInEmitCatalyst(ctx); return rv; } - + private Node evalMethodInEmitCatalyst(DPLParser.EvalMethodInContext ctx) { Node rv = null; @@ -1945,7 +1997,7 @@ private Node evalMethodInEmitCatalyst(DPLParser.EvalMethodInContext ctx) { // Rest of the arguments are values, and are processed as strings List valueList = new ArrayList<>(); - for (int i = 4; i < ctx.getChildCount() - 1; i=i+2) { + for (int i = 4; i < ctx.getChildCount() - 1; i = i + 2) { String value = ctx.getChild(i).getText(); valueList.add(new UnquotedText(new TextString(value)).read()); } @@ -1958,17 +2010,18 @@ private Node evalMethodInEmitCatalyst(DPLParser.EvalMethodInContext ctx) { } /** - * like() eval method - * Returns TRUE if field is like pattern - * Pattern supports wildcards % (multi char) and _ (single char) + * like() eval method Returns TRUE if field is like pattern Pattern supports wildcards % (multi char) and _ (single + * char) + * * @param ctx EvalMethodLikeContext * @return ColumnNode containing the Column for the like() eval method */ - @Override public Node visitEvalMethodLike(DPLParser.EvalMethodLikeContext ctx) { + @Override + public Node visitEvalMethodLike(DPLParser.EvalMethodLikeContext ctx) { Node rv = evalMethodLikeEmitCatalyst(ctx); return rv; } - + private Node evalMethodLikeEmitCatalyst(DPLParser.EvalMethodLikeContext ctx) { Node rv = null; @@ -1981,18 +2034,18 @@ private Node evalMethodLikeEmitCatalyst(DPLParser.EvalMethodLikeContext ctx) { return rv; } - /** - * match() eval method - * Returns true if regex matches the subject + * match() eval method Returns true if regex matches the subject + * * @param ctx EvalMethodMatchContext * @return ColumnNode containing the Column for the match() eval method */ - @Override public Node visitEvalMethodMatch(DPLParser.EvalMethodMatchContext ctx) { + @Override + public Node visitEvalMethodMatch(DPLParser.EvalMethodMatchContext ctx) { Node rv = evalMethodMatchEmitCatalyst(ctx); return rv; } - + private Node evalMethodMatchEmitCatalyst(DPLParser.EvalMethodMatchContext ctx) { Node rv = null; @@ -2013,18 +2066,18 @@ private Node evalMethodMatchEmitCatalyst(DPLParser.EvalMethodMatchContext ctx) { return rv; } - /** - * tostring() eval method - * Returns different types of strings based on given second argument + * tostring() eval method Returns different types of strings based on given second argument + * * @param ctx EvalMethodTostringContext * @return ColumnNode 
containing the Column for the tostring() eval method */ - @Override public Node visitEvalMethodTostring(DPLParser.EvalMethodTostringContext ctx) { + @Override + public Node visitEvalMethodTostring(DPLParser.EvalMethodTostringContext ctx) { Node rv = evalMethodTostringEmitCatalyst(ctx); return rv; } - + private Node evalMethodTostringEmitCatalyst(DPLParser.EvalMethodTostringContext ctx) { Node rv = null; @@ -2038,7 +2091,8 @@ private Node evalMethodTostringEmitCatalyst(DPLParser.EvalMethodTostringContext Column inputCol = ((ColumnNode) visit(ctx.getChild(2))).getColumn(); String options = null; - if (ctx.getChildCount() > 4) options = new UnquotedText(new TextString(ctx.getChild(4).getText())).read(); + if (ctx.getChildCount() > 4) + options = new UnquotedText(new TextString(ctx.getChild(4).getText())).read(); Column col = null; // Base case without options (Y) @@ -2060,7 +2114,10 @@ private Node evalMethodTostringEmitCatalyst(DPLParser.EvalMethodTostringContext col = functions.from_unixtime(inputCol, "HH:mm:ss"); break; default: - throw new UnsupportedOperationException("Unsupported optional argument supplied: '" + options + "'.The argument must be 'hex', 'commas' or 'duration' instead."); + throw new UnsupportedOperationException( + "Unsupported optional argument supplied: '" + options + + "'.The argument must be 'hex', 'commas' or 'duration' instead." + ); } } @@ -2068,18 +2125,18 @@ private Node evalMethodTostringEmitCatalyst(DPLParser.EvalMethodTostringContext return rv; } - /** - * tonumber() eval method - * Returns number string converted to given base, defaults to base-10 + * tonumber() eval method Returns number string converted to given base, defaults to base-10 + * * @param ctx EvalMethodTonumberContext * @return ColumnNode containing the Column for the tonumber() eval method */ - @Override public Node visitEvalMethodTonumber(DPLParser.EvalMethodTonumberContext ctx) { + @Override + public Node visitEvalMethodTonumber(DPLParser.EvalMethodTonumberContext ctx) { Node rv = evalMethodTonumberEmitCatalyst(ctx); return rv; } - + private Node evalMethodTonumberEmitCatalyst(DPLParser.EvalMethodTonumberContext ctx) { Node rv = null; @@ -2103,18 +2160,18 @@ private Node evalMethodTonumberEmitCatalyst(DPLParser.EvalMethodTonumberContext return rv; } - /** - * md5() eval method - * Returns the md5 checksum of given field + * md5() eval method Returns the md5 checksum of given field + * * @param ctx EvalMethodMd5Context * @return ColumnNode containing the Column for the md5() eval method */ - @Override public Node visitEvalMethodMd5(DPLParser.EvalMethodMd5Context ctx) { + @Override + public Node visitEvalMethodMd5(DPLParser.EvalMethodMd5Context ctx) { Node rv = evalMethodMd5EmitCatalyst(ctx); return rv; } - + private Node evalMethodMd5EmitCatalyst(DPLParser.EvalMethodMd5Context ctx) { Node rv = null; @@ -2127,16 +2184,17 @@ private Node evalMethodMd5EmitCatalyst(DPLParser.EvalMethodMd5Context ctx) { } /** - * sha1() eval method - * Returns the sha1 checksum of given field + * sha1() eval method Returns the sha1 checksum of given field + * * @param ctx EvalMethodSha1Context * @return ColumnNode containing the Column for the sha1() eval method */ - @Override public Node visitEvalMethodSha1(DPLParser.EvalMethodSha1Context ctx) { + @Override + public Node visitEvalMethodSha1(DPLParser.EvalMethodSha1Context ctx) { Node rv = evalMethodSha1EmitCatalyst(ctx); return rv; } - + private Node evalMethodSha1EmitCatalyst(DPLParser.EvalMethodSha1Context ctx) { Node rv = null; @@ -2149,16 +2207,17 @@ private 
Node evalMethodSha1EmitCatalyst(DPLParser.EvalMethodSha1Context ctx) { } /** - * sha256() eval method - * Returns the sha256 checksum of given field + * sha256() eval method Returns the sha256 checksum of given field + * * @param ctx EvalMethodSha256Context * @return ColumnNode containing the Column for the sha256() eval method */ - @Override public Node visitEvalMethodSha256(DPLParser.EvalMethodSha256Context ctx) { + @Override + public Node visitEvalMethodSha256(DPLParser.EvalMethodSha256Context ctx) { Node rv = evalMethodSha256EmitCatalyst(ctx); return rv; } - + private Node evalMethodSha256EmitCatalyst(DPLParser.EvalMethodSha256Context ctx) { Node rv = null; @@ -2171,16 +2230,17 @@ private Node evalMethodSha256EmitCatalyst(DPLParser.EvalMethodSha256Context ctx) } /** - * sha512() eval method - * Returns the sha512 checksum of given field + * sha512() eval method Returns the sha512 checksum of given field + * * @param ctx EvalMethodSha512Context * @return ColumnNode containing the column for the sha512() eval method */ - @Override public Node visitEvalMethodSha512(DPLParser.EvalMethodSha512Context ctx) { + @Override + public Node visitEvalMethodSha512(DPLParser.EvalMethodSha512Context ctx) { Node rv = evalMethodSha512EmitCatalyst(ctx); return rv; } - + private Node evalMethodSha512EmitCatalyst(DPLParser.EvalMethodSha512Context ctx) { Node rv = null; @@ -2193,16 +2253,17 @@ private Node evalMethodSha512EmitCatalyst(DPLParser.EvalMethodSha512Context ctx) } /** - * isbool() eval method - * Returns whether or not the field value is a boolean + * isbool() eval method Returns whether or not the field value is a boolean + * * @param ctx EvalMethodIsboolContext * @return ColumnNode containing the Column for the isbool() eval method */ - @Override public Node visitEvalMethodIsbool(DPLParser.EvalMethodIsboolContext ctx) { + @Override + public Node visitEvalMethodIsbool(DPLParser.EvalMethodIsboolContext ctx) { Node rv = evalMethodIsboolEmitCatalyst(ctx); return rv; } - + private Node evalMethodIsboolEmitCatalyst(DPLParser.EvalMethodIsboolContext ctx) { Node rv = null; @@ -2220,16 +2281,17 @@ private Node evalMethodIsboolEmitCatalyst(DPLParser.EvalMethodIsboolContext ctx) } /** - * isint() eval method - * Returns whether or not the field value is an integer + * isint() eval method Returns whether or not the field value is an integer + * * @param ctx EvalMethodIsintContext * @return ColumnNode containing the Column for the isint() eval method */ - @Override public Node visitEvalMethodIsint(DPLParser.EvalMethodIsintContext ctx) { + @Override + public Node visitEvalMethodIsint(DPLParser.EvalMethodIsintContext ctx) { Node rv = evalMethodIsintEmitCatalyst(ctx); return rv; } - + private Node evalMethodIsintEmitCatalyst(DPLParser.EvalMethodIsintContext ctx) { Node rv = null; @@ -2247,16 +2309,17 @@ private Node evalMethodIsintEmitCatalyst(DPLParser.EvalMethodIsintContext ctx) { } /** - * isnum() eval method - * Returns whether or not the field value is a numeric + * isnum() eval method Returns whether or not the field value is a numeric + * * @param ctx EvalMethodIsnumContext * @return ColumnNode containing the Column for the isnum() eval method */ - @Override public Node visitEvalMethodIsnum(DPLParser.EvalMethodIsnumContext ctx) { + @Override + public Node visitEvalMethodIsnum(DPLParser.EvalMethodIsnumContext ctx) { Node rv = evalMethodIsnumEmitCatalyst(ctx); return rv; } - + private Node evalMethodIsnumEmitCatalyst(DPLParser.EvalMethodIsnumContext ctx) { Node rv = null; @@ -2274,16 +2337,17 @@ private 
Node evalMethodIsnumEmitCatalyst(DPLParser.EvalMethodIsnumContext ctx) { } /** - * isstr() eval method - * Returns whether or not the field value is a string + * isstr() eval method Returns whether or not the field value is a string + * * @param ctx EvalMethodIsstrContext * @return ColumnNode containing the Column for the isstr() eval method */ - @Override public Node visitEvalMethodIsstr(DPLParser.EvalMethodIsstrContext ctx) { + @Override + public Node visitEvalMethodIsstr(DPLParser.EvalMethodIsstrContext ctx) { Node rv = evalMethodIsstrEmitCatalyst(ctx); return rv; } - + private Node evalMethodIsstrEmitCatalyst(DPLParser.EvalMethodIsstrContext ctx) { Node rv = null; @@ -2301,16 +2365,17 @@ private Node evalMethodIsstrEmitCatalyst(DPLParser.EvalMethodIsstrContext ctx) { } /** - * typeof() eval method - * Returns the type of the field + * typeof() eval method Returns the type of the field + * * @param ctx EvalMethodTypeofContext * @return ColumnNode containing the Column for the typeof() eval method */ - @Override public Node visitEvalMethodTypeof(DPLParser.EvalMethodTypeofContext ctx) { + @Override + public Node visitEvalMethodTypeof(DPLParser.EvalMethodTypeofContext ctx) { Node rv = evalMethodTypeofEmitCatalyst(ctx); return rv; } - + private Node evalMethodTypeofEmitCatalyst(DPLParser.EvalMethodTypeofContext ctx) { Node rv = null; @@ -2328,16 +2393,17 @@ private Node evalMethodTypeofEmitCatalyst(DPLParser.EvalMethodTypeofContext ctx) } /** - * isnull() eval method - * Returns whether or not the field value is a null + * isnull() eval method Returns whether or not the field value is a null + * * @param ctx EvalMethodIsnullContext * @return ColumnNode containing the Column for the isnull() eval method */ - @Override public Node visitEvalMethodIsnull(DPLParser.EvalMethodIsnullContext ctx) { + @Override + public Node visitEvalMethodIsnull(DPLParser.EvalMethodIsnullContext ctx) { Node rv = evalMethodIsnullEmitCatalyst(ctx); return rv; } - + private Node evalMethodIsnullEmitCatalyst(DPLParser.EvalMethodIsnullContext ctx) { Node rv = null; @@ -2349,16 +2415,17 @@ private Node evalMethodIsnullEmitCatalyst(DPLParser.EvalMethodIsnullContext ctx) } /** - * isnotnull() eval method - * Returns whether or not the field value is a non-null + * isnotnull() eval method Returns whether or not the field value is a non-null + * * @param ctx EvalMethodIsnotnullContext * @return ColumnNode containing the Column for the isnotnull() eval method */ - @Override public Node visitEvalMethodIsnotnull(DPLParser.EvalMethodIsnotnullContext ctx) { + @Override + public Node visitEvalMethodIsnotnull(DPLParser.EvalMethodIsnotnullContext ctx) { Node rv = evalMethodIsnotnullEmitCatalyst(ctx); return rv; } - + private Node evalMethodIsnotnullEmitCatalyst(DPLParser.EvalMethodIsnotnullContext ctx) { Node rv = null; @@ -2370,8 +2437,8 @@ private Node evalMethodIsnotnullEmitCatalyst(DPLParser.EvalMethodIsnotnullContex } /** - * commands() eval method - * Returns the commands used in given search string + * commands() eval method Returns the commands used in given search string + * * @param ctx EvalMethodCommandsContext * @return ColumnNode containing the Column for the commands() eval method */ @@ -2387,7 +2454,8 @@ private Node evalMethodCommandsEmitCatalyst(DPLParser.EvalMethodCommandsContext Column searchString = ((ColumnNode) visit(ctx.getChild(2))).getColumn(); // Register and call UDF Commands - UserDefinedFunction CommandsUDF = functions.udf(new Commands(), DataTypes.createArrayType(DataTypes.StringType, false)); + 
UserDefinedFunction CommandsUDF = functions + .udf(new Commands(), DataTypes.createArrayType(DataTypes.StringType, false)); SparkSession ss = SparkSession.builder().getOrCreate(); ss.udf().register("CommandsUDF", CommandsUDF); @@ -2398,8 +2466,8 @@ private Node evalMethodCommandsEmitCatalyst(DPLParser.EvalMethodCommandsContext } /** - * mvappend() eval method - * Returns a multivalue field with all arguments as values + * mvappend() eval method Returns a multivalue field with all arguments as values + * * @param ctx EvalMethodMvappendContext * @return ColumnNode containing the Column for the mvappend() eval method */ @@ -2418,7 +2486,8 @@ private Node evalMethodMvappendEmitCatalyst(DPLParser.EvalMethodMvappendContext List listOfFields = new ArrayList<>(); for (int i = 2; i <= ctx.getChildCount() - 2; i = i + 2) { Column field = ((ColumnNode) visit(ctx.getChild(i))).getColumn(); - if (field != null) listOfFields.add(field); + if (field != null) + listOfFields.add(field); } Column res = functions.array(JavaConversions.asScalaBuffer(listOfFields)); @@ -2428,8 +2497,8 @@ private Node evalMethodMvappendEmitCatalyst(DPLParser.EvalMethodMvappendContext } /** - * mvcount() eval method - * Returns the amount of items in the multivalue field + * mvcount() eval method Returns the amount of items in the multivalue field + * * @param ctx EvalMethodMvcountContext * @return ColumnNode containing the Column for the mvcount() eval method */ @@ -2451,7 +2520,8 @@ private Node evalMethodMvcountEmitCatalyst(DPLParser.EvalMethodMvcountContext ct Column sizeCol = functions.size(mvfield); // Return null if empty, otherwise return what functions.size() returns - Column res = functions.when(sizeCol.notEqual(functions.lit(0)), sizeCol) + Column res = functions + .when(sizeCol.notEqual(functions.lit(0)), sizeCol) .otherwise(functions.lit(catCtx.nullValue.value()).cast(DataTypes.StringType)); rv = new ColumnNode(res); @@ -2459,8 +2529,8 @@ private Node evalMethodMvcountEmitCatalyst(DPLParser.EvalMethodMvcountContext ct } /** - * mvdedup() eval method - * Returns the given multivalue field with deduplicated values + * mvdedup() eval method Returns the given multivalue field with deduplicated values + * * @param ctx EvalMethodMvdedupContext * @return ColumnNode containing the column for the mvdedup() eval method */ @@ -2477,7 +2547,8 @@ private Node evalMethodMvdedupEmitCatalyst(DPLParser.EvalMethodMvdedupContext ct Column mvfield = ((ColumnNode) visit(ctx.getChild(2))).getColumn(); // Call and register dedup udf - UserDefinedFunction mvDedupUDF = functions.udf(new Mvdedup(), DataTypes.createArrayType(DataTypes.StringType, false)); + UserDefinedFunction mvDedupUDF = functions + .udf(new Mvdedup(), DataTypes.createArrayType(DataTypes.StringType, false)); SparkSession ss = SparkSession.builder().getOrCreate(); ss.udf().register("mvDedupUDF", mvDedupUDF); @@ -2488,9 +2559,9 @@ private Node evalMethodMvdedupEmitCatalyst(DPLParser.EvalMethodMvdedupContext ct } /** - * mvfilter() eval method - * Returns the values in a multivalue field that pass the given regex filter - * TODO Implement, requires? work on the parser side + * mvfilter() eval method Returns the values in a multivalue field that pass the given regex filter TODO Implement, + * requires? 
work on the parser side + * * @param ctx EvalMethodMvfilterContext * @return ColumnNode containing Column for the mvfilter() eval method */ @@ -2523,14 +2594,13 @@ private Node evalMethodMvfilterEmitCatalyst(DPLParser.EvalMethodMvfilterContext Column res = functions.when(booleanExp.equalTo(functions.lit(true)), booleanExpField); res = functions.array(res); - rv = new ColumnNode(res); return rv; } /** - * mvfind() eval method - * Returns the values that match the given regex in the multivalue field provided + * mvfind() eval method Returns the values that match the given regex in the multivalue field provided + * * @param ctx EvalMethodMvfindContext * @return ColumnNode containing the Column for the mvfind() eval method */ @@ -2552,7 +2622,8 @@ private Node evalMethodMvfindEmitCatalyst(DPLParser.EvalMethodMvfindContext ctx) // Register and use UDF regexMatch with multivalue flag set to true boolean isMultiValue = true; - UserDefinedFunction regexMatch = functions.udf(new RegexMatch(isMultiValue, catCtx.nullValue), DataTypes.IntegerType); + UserDefinedFunction regexMatch = functions + .udf(new RegexMatch(isMultiValue, catCtx.nullValue), DataTypes.IntegerType); SparkSession ss = SparkSession.builder().getOrCreate(); ss.udf().register("regexMatch", regexMatch); @@ -2563,8 +2634,8 @@ private Node evalMethodMvfindEmitCatalyst(DPLParser.EvalMethodMvfindContext ctx) } /** - * mvindex() eval method - * Returns the values of the multivalue field between the given indices + * mvindex() eval method Returns the values of the multivalue field between the given indices + * * @param ctx EvalMethodMvindexContext * @return ColumnNode containing Column for the mvindex() eval method */ @@ -2596,20 +2667,21 @@ private Node evalMethodMvindexEmitCatalyst(DPLParser.EvalMethodMvindexContext ct } // Register and use UDF - UserDefinedFunction mvIndexUDF = functions.udf(new Mvindex(), DataTypes.createArrayType(DataTypes.StringType, false)); + UserDefinedFunction mvIndexUDF = functions + .udf(new Mvindex(), DataTypes.createArrayType(DataTypes.StringType, false)); SparkSession ss = SparkSession.builder().getOrCreate(); ss.udf().register("mvIndexUDF", mvIndexUDF); - Column res = functions.callUDF("mvIndexUDF", mvField, startIndex, - endIndex == null ? functions.lit(-1) : endIndex, functions.lit(endIndex != null)); + Column res = functions + .callUDF("mvIndexUDF", mvField, startIndex, endIndex == null ? functions.lit(-1) : endIndex, functions.lit(endIndex != null)); rv = new ColumnNode(res); return rv; } /** - * mvjoin() eval method - * Returns the multivalue field's items concatenated with given delimiter in between each value + * mvjoin() eval method Returns the multivalue field's items concatenated with given delimiter in between each value + * * @param ctx EvalMethodMvjoinContext * @return ColumnNode containing Column for the mvjoin() eval method */ @@ -2642,8 +2714,8 @@ private Node evalMethodMvjoinEmitCatalyst(DPLParser.EvalMethodMvjoinContext ctx) } /** - * mvrange() eval method - * Returns a multivalue field with numbers from start to end with step. + * mvrange() eval method Returns a multivalue field with numbers from start to end with step. 
+ * * @param ctx EvalMethodMvrangeContext * @return ColumnNode containing Column for mvrange() eval method */ @@ -2662,7 +2734,8 @@ private Node evalMethodMvrangeEmitCatalyst(DPLParser.EvalMethodMvrangeContext ct Column step = ((ColumnNode) visit(ctx.getChild(6))).getColumn(); // Register and call UDF Mvrange - UserDefinedFunction MvrangeUDF = functions.udf(new Mvrange(), DataTypes.createArrayType(DataTypes.StringType, false)); + UserDefinedFunction MvrangeUDF = functions + .udf(new Mvrange(), DataTypes.createArrayType(DataTypes.StringType, false)); SparkSession ss = SparkSession.builder().getOrCreate(); ss.udf().register("MvrangeUDF", MvrangeUDF); @@ -2673,8 +2746,8 @@ private Node evalMethodMvrangeEmitCatalyst(DPLParser.EvalMethodMvrangeContext ct } /** - * mvsort() eval method - * Returns the given multivalue field sorted + * mvsort() eval method Returns the given multivalue field sorted + * * @param ctx EvalMethodMvsortContext * @return ColumnNode containing Column for mvsort() eval method */ @@ -2695,8 +2768,8 @@ private Node evalMethodMvsortEmitCatalyst(DPLParser.EvalMethodMvsortContext ctx) } /** - * mvzip() eval method - * Returns the two multivalue field's values "zipped" together, optionally with a delimiter + * mvzip() eval method Returns the two multivalue field's values "zipped" together, optionally with a delimiter + * * @param ctx EvalMethodMvzipContext * @return ColumnNode containing Column for mvzip() eval method */ @@ -2721,19 +2794,21 @@ private Node evalMethodMvzipEmitCatalyst(DPLParser.EvalMethodMvzipContext ctx) { // Register and call UDF Mvzip // Spark built-in function arrays_zip() exists, but it does not support specifying the delimiter - UserDefinedFunction MvzipUDF = functions.udf(new Mvzip(), DataTypes.createArrayType(DataTypes.StringType, false)); + UserDefinedFunction MvzipUDF = functions + .udf(new Mvzip(), DataTypes.createArrayType(DataTypes.StringType, false)); SparkSession ss = SparkSession.builder().getOrCreate(); ss.udf().register("MvzipUDF", MvzipUDF); - Column res = functions.callUDF("MvzipUDF", mvfield1, mvfield2, delimiter != null ? delimiter : functions.lit(",")); + Column res = functions + .callUDF("MvzipUDF", mvfield1, mvfield2, delimiter != null ? 
delimiter : functions.lit(",")); rv = new ColumnNode(res); return rv; } /** - * JSONValid() eval method - * Returns whether or not the given field contains valid json + * JSONValid() eval method Returns whether or not the given field contains valid json + * * @param ctx EvalMethodJSONValidContext * @return ColumnNode containing Column for JSONValid() eval method */ @@ -2760,8 +2835,8 @@ private Node evalMethodJSONValidEmitCatalyst(DPLParser.EvalMethodJSONValidContex } /** - * spath() eval method - * Processes the spath/xpath expression and returns the results + * spath() eval method Processes the spath/xpath expression and returns the results + * * @param ctx EvalMethodSpathContext * @return ColumnNode containing Column for spath() eval method */ @@ -2779,7 +2854,8 @@ private Node evalMethodSpathEmitCatalyst(DPLParser.EvalMethodSpathContext ctx) { Column spathExpr = ((ColumnNode) visit(ctx.getChild(4))).getColumn(); // Register and call UDF Spath - UserDefinedFunction SpathUDF = functions.udf(new Spath(catCtx.nullValue), DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType)); + UserDefinedFunction SpathUDF = functions + .udf(new Spath(catCtx.nullValue), DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType)); SparkSession ss = SparkSession.builder().getOrCreate(); ss.udf().register("SpathUDF", SpathUDF); @@ -2790,8 +2866,8 @@ private Node evalMethodSpathEmitCatalyst(DPLParser.EvalMethodSpathContext ctx) { } /** - * time() eval method - * Returns the current time in seconds + * time() eval method Returns the current time in seconds + * * @param ctx EvalMethodTimeContext * @return ColumnNode containing Column for time() eval method */ @@ -2806,7 +2882,8 @@ private Node evalMethodTimeEmitCatalyst(DPLParser.EvalMethodTimeContext ctx) { // Get current time in seconds, and add nanoseconds converted to seconds on top Instant now = Instant.now(); - double currentTimeInSecs = ((double)now.getEpochSecond()) + ((((double)now.getNano() / 1000d) / 1000d) / 1000d); + double currentTimeInSecs = ((double) now.getEpochSecond()) + + ((((double) now.getNano() / 1000d) / 1000d) / 1000d); // Known formatting type DecimalFormat df = new DecimalFormat("0.000000"); @@ -2815,7 +2892,7 @@ private Node evalMethodTimeEmitCatalyst(DPLParser.EvalMethodTimeContext ctx) { rv = new ColumnNode(res); return rv; } - + // // -- Eval types (field, integer, etc.) 
-- // @@ -2882,12 +2959,16 @@ public Node visitEvalStringType(DPLParser.EvalStringTypeContext ctx) { } @Override - public Node visitL_evalStatement_evalCalculateStatement_multipliers(DPLParser.L_evalStatement_evalCalculateStatement_multipliersContext ctx) { + public Node visitL_evalStatement_evalCalculateStatement_multipliers( + DPLParser.L_evalStatement_evalCalculateStatement_multipliersContext ctx + ) { return evalCalculateStatementEmitCatalyst(ctx); } @Override - public Node visitL_evalStatement_evalCalculateStatement_minus_plus(DPLParser.L_evalStatement_evalCalculateStatement_minus_plusContext ctx) { + public Node visitL_evalStatement_evalCalculateStatement_minus_plus( + DPLParser.L_evalStatement_evalCalculateStatement_minus_plusContext ctx + ) { return evalCalculateStatementEmitCatalyst(ctx); } @@ -2902,63 +2983,65 @@ public Node evalCalculateStatementEmitCatalyst(ParserRuleContext ctx) { return new ColumnNode(res); /* Column res = null; - switch (opNode.getSymbol().getType()) { - case DPLLexer.EVAL_LANGUAGE_MODE_PLUS: // '+' - Column plus = leftSide.plus(rightSide); - res = plus; - break; - case DPLLexer.EVAL_LANGUAGE_MODE_MINUS: // '-' - res = leftSide.minus(rightSide); - break; - case DPLLexer.EVAL_LANGUAGE_MODE_WILDCARD: // '*' - res = leftSide.multiply(rightSide); - break; - case DPLLexer.EVAL_LANGUAGE_MODE_SLASH: // '/' - res = leftSide.divide(rightSide); - break; - case DPLLexer.EVAL_LANGUAGE_MODE_PERCENT: // '%' - res = leftSide.mod(rightSide); - break; - default: - throw new UnsupportedOperationException("Unknown EvalCalculateStatement operation: " + opNode.getText()); - } - - return new ColumnNode(res);*/ - } - - @Override - public Node visitL_evalStatement_evalConcatenateStatement(DPLParser.L_evalStatement_evalConcatenateStatementContext ctx) { + switch (opNode.getSymbol().getType()) { + case DPLLexer.EVAL_LANGUAGE_MODE_PLUS: // '+' + Column plus = leftSide.plus(rightSide); + res = plus; + break; + case DPLLexer.EVAL_LANGUAGE_MODE_MINUS: // '-' + res = leftSide.minus(rightSide); + break; + case DPLLexer.EVAL_LANGUAGE_MODE_WILDCARD: // '*' + res = leftSide.multiply(rightSide); + break; + case DPLLexer.EVAL_LANGUAGE_MODE_SLASH: // '/' + res = leftSide.divide(rightSide); + break; + case DPLLexer.EVAL_LANGUAGE_MODE_PERCENT: // '%' + res = leftSide.mod(rightSide); + break; + default: + throw new UnsupportedOperationException("Unknown EvalCalculateStatement operation: " + opNode.getText()); + } + + return new ColumnNode(res);*/ + } + + @Override + public Node visitL_evalStatement_evalConcatenateStatement( + DPLParser.L_evalStatement_evalConcatenateStatementContext ctx + ) { throw new UnsupportedOperationException("evalConcatenateStatement not supported yet"); - /* was as bellow, in SQL mode + /* was as bellow, in SQL mode if (doc != null) { - throw new RuntimeException("evalConcatenateStatementEmitSql not implemented yet: " + ctx.getText()); -// return null; + throw new RuntimeException("evalConcatenateStatementEmitSql not implemented yet: " + ctx.getText()); + // return null; } return evalConcatenateStatementEmitSql(ctx); - - - public StringNode evalConcatenateStatementEmitSql(DPLParser.L_evalStatement_evalConcatenateStatementContext ctx) { + + + public StringNode evalConcatenateStatementEmitSql(DPLParser.L_evalStatement_evalConcatenateStatementContext ctx) { String sql = null; boolean useConcat = false; - + Node left = visit(ctx.getChild(0)); TerminalNode operation = (TerminalNode) ctx.getChild(1); Node right = visit(ctx.getChild(2)); - + String leftOperand = left.toString(); 
String rightOperand = right.toString(); // check whether operand name exist in symbol-table and replace it if (symbolTable.get(leftOperand) != null) { - leftOperand = symbolTable.get(leftOperand); + leftOperand = symbolTable.get(leftOperand); } if (symbolTable.get(rightOperand) != null) { - rightOperand = symbolTable.get(rightOperand); + rightOperand = symbolTable.get(rightOperand); } sql = "CONCAT(" + leftOperand + ", " + rightOperand + ")"; StringNode sNode = new StringNode(new Token(Type.STRING, sql)); return sNode; - } - */ + } + */ } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Cidrmatch.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Cidrmatch.java index 5511014..0b99ede 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Cidrmatch.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Cidrmatch.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import com.teragrep.pth10.ast.TextString; @@ -55,96 +54,100 @@ import java.net.UnknownHostException; /** - * User Defined Function for command cidrmatch(ip, subnet)

+ * User Defined Function for command cidrmatch(ip, subnet)
+ *
* Assumes that both ip and subnet are in String format, and returns a boolean.
 * Returns TRUE if the ip belongs to the subnet given. Otherwise returns FALSE.
- * Expects subnet to be in CIDR form. Like 192.168.1.1/24. Where the number after / is the netmask length in bits.

- *
+ * Expects subnet to be in CIDR form. Like 192.168.1.1/24. Where the number after / is the netmask length in bits.
+ *
* Netmask length explanation:
* Until 255.x.x.x netmask=8
* Until 255.255.x.x netmask=16
* Until 255.255.255.x netmask=24
* Until 255.255.255.255 netmask=32
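// A minimal illustrative sketch (not part of the original patch): how a CIDR mask
// length such as the ones listed above drives the byte-wise comparison done in the
// call() method further below, assuming a hypothetical /20 subnet like 10.1.16.0/20.
int nMaskBits = 20;                                     // bits taken from "x.x.x.x/20"
int nMaskFullBytes = nMaskBits / 8;                     // 2 full bytes must match exactly
byte finalByte = (byte) (0xFF00 >> (nMaskBits & 0x07)); // 0xF0: only the top 4 bits of the third byte are compared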
+ * * @author eemhu - * */ public class Cidrmatch implements UDF2, Serializable { - private static final long serialVersionUID = 1L; - - @Override - public Boolean call(String ip, String subnet) throws Exception { - - // Strip quotes, if any - subnet = new UnquotedText(new TextString(subnet)).read(); - ip = new UnquotedText(new TextString(ip)).read(); - - int nMaskBits; - InetAddress requiredAdd = null; - - // Check for subnet mask length in bits (given after '/' character) - if (subnet.indexOf('/') > 0) { - String[] addWithMask = subnet.split("/"); - - subnet = addWithMask[0]; - nMaskBits = Integer.parseInt(addWithMask[1]); - } - else { - // Set to -1 if subnet mask length was not given - nMaskBits = -1; - } - - // Convert subnet string to InetAddress object - try { - requiredAdd = InetAddress.getByName(subnet); - } - catch (UnknownHostException e) { - throw new RuntimeException("Cidrmatch could not convert subnet string to InetAddress object. Check that the string is a valid IP address, like 192.168.1.1."); - } - - // Convert ip string to InetAddress object - InetAddress remoteAdd = null; - try { - remoteAdd = InetAddress.getByName(ip); - } - catch (UnknownHostException e) { - throw new RuntimeException("Cidrmatch could not convert IP string to InetAddress object. Check that the string is a valid IP address, like 192.168.1.1."); - } - - // Check that both are valid InetAddress objects - if (!requiredAdd.getClass().equals(remoteAdd.getClass())) { - return false; - } - - // If no subnet mask was found, do a direct comparison. - if (nMaskBits < 0) { - boolean isSame = remoteAdd == requiredAdd; - - return isSame; - } - - // Subnet mask was given, check against given mask - byte[] remAddr = remoteAdd.getAddress(); - byte[] reqAddr = requiredAdd.getAddress(); - - int nMaskFullBytes = nMaskBits / 8; - - byte finalByte = (byte) (0xFF00 >> (nMaskBits & 0x07)); - - for (int i = 0; i < nMaskFullBytes; i++) { - if (remAddr[i] != reqAddr[i]) { - return false; - } - } - - if (finalByte != 0) { - boolean isSame = (remAddr[nMaskFullBytes] & finalByte) == (reqAddr[nMaskFullBytes] & finalByte); - - return isSame; - } - - return true; - } - - + private static final long serialVersionUID = 1L; + + @Override + public Boolean call(String ip, String subnet) throws Exception { + + // Strip quotes, if any + subnet = new UnquotedText(new TextString(subnet)).read(); + ip = new UnquotedText(new TextString(ip)).read(); + + int nMaskBits; + InetAddress requiredAdd = null; + + // Check for subnet mask length in bits (given after '/' character) + if (subnet.indexOf('/') > 0) { + String[] addWithMask = subnet.split("/"); + + subnet = addWithMask[0]; + nMaskBits = Integer.parseInt(addWithMask[1]); + } + else { + // Set to -1 if subnet mask length was not given + nMaskBits = -1; + } + + // Convert subnet string to InetAddress object + try { + requiredAdd = InetAddress.getByName(subnet); + } + catch (UnknownHostException e) { + throw new RuntimeException( + "Cidrmatch could not convert subnet string to InetAddress object. Check that the string is a valid IP address, like 192.168.1.1." + ); + } + + // Convert ip string to InetAddress object + InetAddress remoteAdd = null; + try { + remoteAdd = InetAddress.getByName(ip); + } + catch (UnknownHostException e) { + throw new RuntimeException( + "Cidrmatch could not convert IP string to InetAddress object. Check that the string is a valid IP address, like 192.168.1.1." 
+ ); + } + + // Check that both are valid InetAddress objects + if (!requiredAdd.getClass().equals(remoteAdd.getClass())) { + return false; + } + + // If no subnet mask was found, do a direct comparison. + if (nMaskBits < 0) { + boolean isSame = remoteAdd == requiredAdd; + + return isSame; + } + + // Subnet mask was given, check against given mask + byte[] remAddr = remoteAdd.getAddress(); + byte[] reqAddr = requiredAdd.getAddress(); + + int nMaskFullBytes = nMaskBits / 8; + + byte finalByte = (byte) (0xFF00 >> (nMaskBits & 0x07)); + + for (int i = 0; i < nMaskFullBytes; i++) { + if (remAddr[i] != reqAddr[i]) { + return false; + } + } + + if (finalByte != 0) { + boolean isSame = (remAddr[nMaskFullBytes] & finalByte) == (reqAddr[nMaskFullBytes] & finalByte); + + return isSame; + } + + return true; + } + } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Commands.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Commands.java index b5bb91b..8c42768 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Commands.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Commands.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF1; @@ -61,24 +60,24 @@ */ public class Commands implements UDF1>, Serializable { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; + + @Override + public List call(String searchStr) throws Exception { + List rv = new ArrayList<>(); + + // Running this command should result in following input->output: + // example: "search foo | stats count | sort count" -> search, stats, sort + + // should result in "search foo", "stats count", "sort count" + String[] cmds = searchStr.split("\\|"); + + // remove unnecessary whitespace and create mv field (Spark ArrayType) + for (int i = 0; i < cmds.length; i++) { + String currentCmd = cmds[i].trim(); + rv.add(currentCmd.substring(0, currentCmd.indexOf(' '))); + } - @Override - public List call(String searchStr) throws Exception { - List rv = new ArrayList<>(); - - // Running this command should result in following input->output: - // example: "search foo | stats count | sort count" -> search, stats, sort - - // should result in "search foo", "stats count", "sort count" - String[] cmds = searchStr.split("\\|"); - - // remove unnecessary whitespace and create mv field (Spark ArrayType) - for (int i = 0; i < cmds.length; i++) { - String currentCmd = cmds[i].trim(); - rv.add(currentCmd.substring(0, currentCmd.indexOf(' '))); - } - - return rv; - } + return rv; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/EvalArithmetic.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/EvalArithmetic.java index e986b08..bd8a293 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/EvalArithmetic.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/EvalArithmetic.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import com.teragrep.pth10.steps.ParsedResult; @@ -54,10 +53,11 @@ import java.math.RoundingMode; /** - * Checks if left and right side are longs/doubles, and performs basic arithmetic on them if they are, - * otherwise concatenate them as strings + * Checks if left and right side are longs/doubles, and performs basic arithmetic on them if they are, otherwise + * concatenate them as strings */ public class EvalArithmetic implements UDF3 { + @Override public String call(Object l, String op, Object r) throws Exception { // try long @@ -69,14 +69,17 @@ public String call(Object l, String op, Object r) throws Exception { if (left.getType() == ParsedResult.Type.STRING || right.getType() == ParsedResult.Type.STRING) { if (op.equals("+")) { return l.toString().concat(r.toString()); - } else { + } + else { throw new IllegalArgumentException("Eval arithmetics only allow Strings for the + operator."); } } // change left and right numbers into BigDecimal - BigDecimal leftNumber = left.getType() == ParsedResult.Type.DOUBLE ? BigDecimal.valueOf(left.getDouble()) : BigDecimal.valueOf(left.getLong()); - BigDecimal rightNumber = right.getType() == ParsedResult.Type.DOUBLE ? BigDecimal.valueOf(right.getDouble()) : BigDecimal.valueOf(right.getLong()); + BigDecimal leftNumber = left.getType() == ParsedResult.Type.DOUBLE ? BigDecimal + .valueOf(left.getDouble()) : BigDecimal.valueOf(left.getLong()); + BigDecimal rightNumber = right.getType() == ParsedResult.Type.DOUBLE ? BigDecimal + .valueOf(right.getDouble()) : BigDecimal.valueOf(right.getLong()); switch (op) { case "+": @@ -88,7 +91,8 @@ public String call(Object l, String op, Object r) throws Exception { case "/": try { return leftNumber.divide(rightNumber).stripTrailingZeros().toPlainString(); - } catch (ArithmeticException e) { + } + catch (ArithmeticException e) { // show 7 first decimals if the result of the division is a repeating number return leftNumber.divide(rightNumber, 7, RoundingMode.HALF_UP).toPlainString(); } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/EvalOperation.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/EvalOperation.java index da62ad7..0eea56a 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/EvalOperation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/EvalOperation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import com.teragrep.pth10.steps.ParsedResult; @@ -58,6 +57,7 @@ * UDF for comparing two fields. 
*/ public class EvalOperation implements UDF3 { + @Override public Boolean call(Object l, Integer operationType, Object r) throws Exception { // Parse in case a number has been set to a String @@ -72,8 +72,12 @@ public Boolean call(Object l, Integer operationType, Object r) throws Exception boolean rv; // Only two numbers or two strings allowed. Throw error if mixed. - if ((leftType == ParsedResult.Type.STRING && (rightType == ParsedResult.Type.DOUBLE || rightType == ParsedResult.Type.LONG)) - || (rightType == ParsedResult.Type.STRING && (leftType == ParsedResult.Type.DOUBLE || leftType == ParsedResult.Type.LONG))) { + if ( + (leftType == ParsedResult.Type.STRING + && (rightType == ParsedResult.Type.DOUBLE || rightType == ParsedResult.Type.LONG)) + || (rightType == ParsedResult.Type.STRING + && (leftType == ParsedResult.Type.DOUBLE || leftType == ParsedResult.Type.LONG)) + ) { throw new IllegalArgumentException("Eval comparisons only allow using two numbers or two strings."); } // If both are Strings @@ -113,10 +117,15 @@ else if (leftType == ParsedResult.Type.STRING && rightType == ParsedResult.Type. } } // If both are numbers - else if ((leftType == ParsedResult.Type.DOUBLE || leftType == ParsedResult.Type.LONG) && (rightType == ParsedResult.Type.DOUBLE || rightType == ParsedResult.Type.LONG)) { + else if ( + (leftType == ParsedResult.Type.DOUBLE || leftType == ParsedResult.Type.LONG) + && (rightType == ParsedResult.Type.DOUBLE || rightType == ParsedResult.Type.LONG) + ) { // change left and right numbers into BigDecimal - BigDecimal leftNumber = left.getType() == ParsedResult.Type.DOUBLE ? BigDecimal.valueOf(left.getDouble()) : BigDecimal.valueOf(left.getLong()); - BigDecimal rightNumber = right.getType() == ParsedResult.Type.DOUBLE ? BigDecimal.valueOf(right.getDouble()) : BigDecimal.valueOf(right.getLong()); + BigDecimal leftNumber = left.getType() == ParsedResult.Type.DOUBLE ? BigDecimal + .valueOf(left.getDouble()) : BigDecimal.valueOf(left.getLong()); + BigDecimal rightNumber = right.getType() == ParsedResult.Type.DOUBLE ? BigDecimal + .valueOf(right.getDouble()) : BigDecimal.valueOf(right.getLong()); switch (operationType) { case DPLLexer.EVAL_LANGUAGE_MODE_EQ: @@ -148,14 +157,18 @@ else if ((leftType == ParsedResult.Type.DOUBLE || leftType == ParsedResult.Type. throw new RuntimeException("EvalStatement: Unknown operation in EvalOperation"); } } - } else if (leftType == ParsedResult.Type.LIST && rightType == ParsedResult.Type.LIST) { + } + else if (leftType == ParsedResult.Type.LIST && rightType == ParsedResult.Type.LIST) { // both lists; check internal elements to match List leftList = left.getList(); List rightList = right.getList(); rv = leftList.equals(rightList); - } else if ((leftType == ParsedResult.Type.LIST && rightType == ParsedResult.Type.STRING) || - (leftType == ParsedResult.Type.STRING && rightType == ParsedResult.Type.LIST)) { + } + else if ( + (leftType == ParsedResult.Type.LIST && rightType == ParsedResult.Type.STRING) + || (leftType == ParsedResult.Type.STRING && rightType == ParsedResult.Type.LIST) + ) { // one is list, other string // in this case check if list contains string List list; @@ -163,13 +176,15 @@ else if ((leftType == ParsedResult.Type.DOUBLE || leftType == ParsedResult.Type. 
if (leftType == ParsedResult.Type.LIST) { list = left.getList(); str = right.getString(); - } else { + } + else { list = right.getList(); str = left.getString(); } rv = list.contains(str); - } else { + } + else { throw new IllegalArgumentException("Eval comparison: Unsupported datatype detected"); } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/IfClause.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/IfClause.java index 51512c6..50d2d5d 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/IfClause.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/IfClause.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -58,13 +58,15 @@ public class IfClause implements UDF3> { public List call(Boolean comparisonResult, Object ifTrue, Object ifFalse) throws Exception { if (comparisonResult) { return objectToList(ifTrue); - } else { + } + else { return objectToList(ifFalse); } } /** * Converts an object to a singleton list or a list. + * * @param o LIST object or any object * @return full list if the object was a list, otherwise singleton list */ @@ -73,8 +75,9 @@ private List objectToList(Object o) { ParsedResult pr = typeParser.parse(o); if (pr.getType() == ParsedResult.Type.LIST) { return pr.getList(); - } else { + } + else { return Collections.singletonList(o.toString()); } } -} \ No newline at end of file +} diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/InverseHyperbolicFunction.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/InverseHyperbolicFunction.java index e2f472a..c43f6c9 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/InverseHyperbolicFunction.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/InverseHyperbolicFunction.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF1; @@ -53,76 +52,75 @@ /** * UDF for acosh(x), asinh(x) and atanh(x)
* Spark built-in functions exist in version {@literal >=3.1.0} + * * @author eemhu - * */ public class InverseHyperbolicFunction implements UDF1, Serializable { - private static final long serialVersionUID = 1L; - private String function = "acosh"; - - public InverseHyperbolicFunction(String function) { - super(); - this.function = function; - } + private static final long serialVersionUID = 1L; + private String function = "acosh"; + + public InverseHyperbolicFunction(String function) { + super(); + this.function = function; + } + + @Override + public Double call(Object x) throws Exception { + // Take x in as Object, so it can be multiple different types + // instead of making one UDF for each possible type + Double xAsDouble = convertObjectToDouble(x); + + switch (function) { + case "acosh": { + // Use Apache Commons function for acosh + org.apache.commons.math3.analysis.function.Acosh acoshFunction = new org.apache.commons.math3.analysis.function.Acosh(); + + return acoshFunction.value(xAsDouble); + } + case "asinh": { + org.apache.commons.math3.analysis.function.Asinh asinhFunction = new org.apache.commons.math3.analysis.function.Asinh(); + + return asinhFunction.value(xAsDouble); + } + case "atanh": { + org.apache.commons.math3.analysis.function.Atanh atanhFunction = new org.apache.commons.math3.analysis.function.Atanh(); + + return atanhFunction.value(xAsDouble); + } + default: { + throw new RuntimeException("Invalid inverse hyperbolic function: " + function); + } + } + + } + + private Double convertObjectToDouble(Object x) { + Double xAsDouble = null; - @Override - public Double call(Object x) throws Exception { - // Take x in as Object, so it can be multiple different types - // instead of making one UDF for each possible type - Double xAsDouble = convertObjectToDouble(x); - - switch (function) { - case "acosh": { - // Use Apache Commons function for acosh - org.apache.commons.math3.analysis.function.Acosh acoshFunction = - new org.apache.commons.math3.analysis.function.Acosh(); + if (x instanceof Long) { + xAsDouble = ((Long) x).doubleValue(); + } + else if (x instanceof Integer) { + xAsDouble = ((Integer) x).doubleValue(); + } + else if (x instanceof Double) { + xAsDouble = (Double) x; + } + else if (x instanceof Float) { + xAsDouble = ((Float) x).doubleValue(); + } + else if (x instanceof String) { + xAsDouble = Double.valueOf((String) x); + } + else { + throw new RuntimeException( + this.function + + " input value couldn't be converted to Double. Expected Long, Integer, Double, Float or String." 
+ ); + } - return acoshFunction.value(xAsDouble); - } - case "asinh": { - org.apache.commons.math3.analysis.function.Asinh asinhFunction = - new org.apache.commons.math3.analysis.function.Asinh(); - - return asinhFunction.value(xAsDouble); - } - case "atanh": { - org.apache.commons.math3.analysis.function.Atanh atanhFunction = - new org.apache.commons.math3.analysis.function.Atanh(); - - return atanhFunction.value(xAsDouble); - } - default: { - throw new RuntimeException("Invalid inverse hyperbolic function: " + function); - } - } - - - } - - private Double convertObjectToDouble(Object x) { - Double xAsDouble = null; - - if (x instanceof Long) { - xAsDouble = ((Long) x).doubleValue(); - } - else if (x instanceof Integer) { - xAsDouble = ((Integer) x).doubleValue(); - } - else if (x instanceof Double) { - xAsDouble = (Double) x; - } - else if (x instanceof Float) { - xAsDouble = ((Float) x).doubleValue(); - } - else if (x instanceof String) { - xAsDouble = Double.valueOf((String) x); - } - else { - throw new RuntimeException(this.function + " input value couldn't be converted to Double. Expected Long, Integer, Double, Float or String."); - } - - return xAsDouble; - } + return xAsDouble; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/IsType.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/IsType.java index 54cfc61..b18f829 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/IsType.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/IsType.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF1; @@ -52,34 +51,33 @@ /** * UDF used for evals isBool, isInt, isNum and isStr
- * Checks the type of the input against Java types and - * returns true if it matches or false otherwise + * Checks the type of the input against Java types and returns true if it matches or false otherwise + * * @author eemhu - * */ public class IsType implements UDF1, Serializable { - private static final long serialVersionUID = 1L; - private String checkForType = "Boolean"; - - public IsType(String type) { - super(); - this.checkForType = type; - } + private static final long serialVersionUID = 1L; + private String checkForType = "Boolean"; + + public IsType(String type) { + super(); + this.checkForType = type; + } - @Override - public Boolean call(Object obj) throws Exception { - switch (checkForType) { - case "Boolean": - return (obj instanceof Boolean); - case "Integer": - return (obj instanceof Integer); - case "Numeric": - return (obj instanceof Integer || obj instanceof Double || obj instanceof Long || obj instanceof Float); - case "String": - return (obj instanceof String); - default: - throw new UnsupportedOperationException("TypeOf UDF cannot check for type: " + checkForType); - } - } + @Override + public Boolean call(Object obj) throws Exception { + switch (checkForType) { + case "Boolean": + return (obj instanceof Boolean); + case "Integer": + return (obj instanceof Integer); + case "Numeric": + return (obj instanceof Integer || obj instanceof Double || obj instanceof Long || obj instanceof Float); + case "String": + return (obj instanceof String); + default: + throw new UnsupportedOperationException("TypeOf UDF cannot check for type: " + checkForType); + } + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/JSONValid.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/JSONValid.java index 8993691..420414b 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/JSONValid.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/JSONValid.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import com.google.gson.Gson; @@ -58,26 +57,26 @@ * Where x is a field containing a string, either in JSON format or not.
* Returns TRUE for valid JSON, and FALSE for strings that are not considered to be valid JSON.
* The check is fairly strict, using com.google.gson.Gson. + * * @author eemhu - * */ public class JSONValid implements UDF1, Serializable { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; + + @Override + public Boolean call(String jsonStr) throws Exception { + boolean isValidJson = true; + + try { + new Gson().getAdapter(JsonElement.class).fromJson(jsonStr); + } + catch (IOException ie) { + // Gson will throw an IOException if jsonStr is not valid JSON + isValidJson = false; + } - @Override - public Boolean call(String jsonStr) throws Exception { - boolean isValidJson = true; - - try { - new Gson().getAdapter(JsonElement.class).fromJson(jsonStr); - } - catch (IOException ie) { - // Gson will throw an IOException if jsonStr is not valid JSON - isValidJson = false; - } - - return isValidJson; - } + return isValidJson; + } -} \ No newline at end of file +} diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/LikeComparison.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/LikeComparison.java index 03375fa..5f69a91 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/LikeComparison.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/LikeComparison.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -51,11 +51,11 @@ import java.util.regex.Pattern; /** - * UDF for '| where x like y' command - * Returns true or false if it matches - * Uses '%' for multi character wildcard, '_' for single + * UDF for '| where x like y' command Returns true or false if it matches Uses '%' for multi character wildcard, '_' for + * single */ public class LikeComparison implements UDF2 { + @Override public Boolean call(String input, String pattern) throws Exception { // input is the text to be compared diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/MinMax.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/MinMax.java index bcc0e01..898b0f8 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/MinMax.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/MinMax.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF2; @@ -58,10 +57,10 @@ import java.util.Optional; /** - * UDF to used to calculate eval min/max functions. - * The isMin boolean is used to choose the function. + * UDF to used to calculate eval min/max functions. The isMin boolean is used to choose the function. */ public class MinMax implements UDF2, Boolean, String> { + @Override public String call(WrappedArray items, Boolean isMin) throws Exception { Iterator it = items.iterator(); @@ -74,17 +73,20 @@ public String call(WrappedArray items, Boolean isMin) throws Exception { if (current instanceof String) { try { - Long.valueOf((String)current); - } catch (NumberFormatException nfe) { + Long.valueOf((String) current); + } + catch (NumberFormatException nfe) { try { - Double.valueOf((String)current); + Double.valueOf((String) current); outputType = DataTypes.DoubleType; - } catch (NumberFormatException nfe2) { + } + catch (NumberFormatException nfe2) { outputType = DataTypes.StringType; break; } } - } else if (current instanceof Double || current instanceof Float) { + } + else if (current instanceof Double || current instanceof Float) { outputType = DataTypes.DoubleType; } } @@ -95,17 +97,22 @@ public String call(WrappedArray items, Boolean isMin) throws Exception { if (isMin) { if (outputType.equals(DataTypes.StringType)) { result = javaList.stream().min(Comparator.comparing(Object::toString)); - } else if (outputType.equals(DataTypes.DoubleType)) { + } + else if (outputType.equals(DataTypes.DoubleType)) { result = javaList.stream().min(Comparator.comparing(a -> Double.valueOf(a.toString()))); - } else { + } + else { result = javaList.stream().min(Comparator.comparing(a -> Long.valueOf(a.toString()))); } - } else { + } + else { if (outputType.equals(DataTypes.StringType)) { result = javaList.stream().max(Comparator.comparing(Object::toString)); - } else if (outputType.equals(DataTypes.DoubleType)) { + } + else if (outputType.equals(DataTypes.DoubleType)) { result = javaList.stream().max(Comparator.comparing(a -> Double.valueOf(a.toString()))); - } else { + } + else { result = javaList.stream().max(Comparator.comparing(a -> Long.valueOf(a.toString()))); } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvdedup.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvdedup.java index 672f71d..a796c4e 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvdedup.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvdedup.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF1; @@ -51,29 +50,28 @@ import java.io.Serializable; - /** * Takes in an multivalue field and returns a multivalue field without duplicates.
+ * * @author eemhu - * */ public class Mvdedup implements UDF1, scala.collection.immutable.List>, Serializable { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; + + @Override + public scala.collection.immutable.List call(WrappedArray input) throws Exception { + // MV field is taken into UDF as Scala WrappedArray, + // and returned as Scala List + + // 1 or less values -> can't have any duplicates + if (input.size() <= 1) { + return input.toList(); + } - @Override - public scala.collection.immutable.List call(WrappedArray input) throws Exception { - // MV field is taken into UDF as Scala WrappedArray, - // and returned as Scala List - - // 1 or less values -> can't have any duplicates - if (input.size() <= 1) { - return input.toList(); - } - - // Otherwise, create stream and collect distinct values - // and return the collected list - return input.toStream().distinct().toList(); - } + // Otherwise, create stream and collect distinct values + // and return the collected list + return input.toStream().distinct().toList(); + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvindex.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvindex.java index b2e4a63..2295766 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvindex.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvindex.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF4; @@ -61,39 +60,49 @@ * (2) mvindex(field, 0, -1) - return all elements
* (3) mvindex(field, 0) - return only the first element
* (4) mvindex(field, -1) - return only the last element
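 * Illustrative sketch, not part of the original patch: assuming a multivalue field mv = ["a", "b", "c", "d", "e"],
 * the forms above would behave roughly as follows:
 * mvindex(mv, 1, 3)  - ["b", "c", "d"]
 * mvindex(mv, 0, -1) - ["a", "b", "c", "d", "e"]
 * mvindex(mv, 0)     - ["a"]
 * mvindex(mv, -1)    - ["e"]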
+ * * @author eemhu - * */ -public class Mvindex implements UDF4, Integer, Integer, Boolean, WrappedArray>, Serializable { +public class Mvindex + implements UDF4, Integer, Integer, Boolean, WrappedArray>, Serializable { + + private static final long serialVersionUID = 1L; + + @SuppressWarnings("unchecked") + @Override + public WrappedArray call( + WrappedArray mvField, + Integer startIndex, + Integer endIndex, + Boolean endIndexProvided + ) throws Exception { + // If endIndex was not given, get one element specified by startIndex + if (!endIndexProvided) { + if (startIndex != -1) { + // if start=0,1,2,... with no endIndex, get that element only + return ((WrappedArray) mvField.slice(startIndex, startIndex + 1)); + } + else { + // if start=-1, get last element + return ((WrappedArray) mvField.takeRight(1)); + } + } + else if (endIndex == -1) { + // if endIndex=-1, set it to last element + endIndex = mvField.size() - 1; + } - private static final long serialVersionUID = 1L; + // Drop elements from left and right based on given indices + // However, if nothing is to be dropped, don't even call the drop()/dropRight() function + int nDropFromRight = (mvField.size() - (endIndex + 1)); + if (nDropFromRight > 0) + mvField = (WrappedArray) mvField.dropRight(nDropFromRight); - @SuppressWarnings("unchecked") - @Override - public WrappedArray call(WrappedArray mvField, Integer startIndex, Integer endIndex, Boolean endIndexProvided) throws Exception { - // If endIndex was not given, get one element specified by startIndex - if (!endIndexProvided) { - if (startIndex != -1) { - // if start=0,1,2,... with no endIndex, get that element only - return ((WrappedArray)mvField.slice(startIndex, startIndex + 1)); - } else { - // if start=-1, get last element - return ((WrappedArray)mvField.takeRight(1)); - } - } else if (endIndex == -1) { - // if endIndex=-1, set it to last element - endIndex = mvField.size() - 1; - } - - // Drop elements from left and right based on given indices - // However, if nothing is to be dropped, don't even call the drop()/dropRight() function - int nDropFromRight = (mvField.size() - (endIndex+1)); - if (nDropFromRight > 0) mvField = (WrappedArray) mvField.dropRight(nDropFromRight); + int nDropFromLeft = startIndex; + if (nDropFromLeft > 0) + mvField = (WrappedArray) mvField.drop(nDropFromLeft); - int nDropFromLeft = startIndex; - if (nDropFromLeft > 0) mvField = (WrappedArray) mvField.drop(nDropFromLeft); - - return mvField; - } + return mvField; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvjoin.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvjoin.java index 1498eaa..08646a9 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvjoin.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvjoin.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF2; @@ -55,30 +54,31 @@ /** * UDF for command mvjoin(mvfield, str)
* Joins the mvfield's values with str delimiter into a new mv field
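 * Illustrative sketch, not part of the original patch: assuming mvfield = ["a", "b", "c"],
 * mvjoin(mvfield, "-") would produce the single string "a-b-c".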
+ * * @author eemhu - * */ public class Mvjoin implements UDF2, String, String>, Serializable { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; + + @Override + public String call(WrappedArray mvfield, String str) throws Exception { + String rv = ""; + + Iterator it = mvfield.iterator(); + + while (it.hasNext()) { + String nextItem = it.next(); + + // Concatenate item + given delimiter + // hasNext() check is used to see if the current one is the final value, + // and delimiter won't be added if that is the case + rv = rv.concat(nextItem); + if (it.hasNext()) + rv = rv.concat(str); + } - @Override - public String call(WrappedArray mvfield, String str) throws Exception { - String rv = ""; - - Iterator it = mvfield.iterator(); - - while (it.hasNext()) { - String nextItem = it.next(); - - // Concatenate item + given delimiter - // hasNext() check is used to see if the current one is the final value, - // and delimiter won't be added if that is the case - rv = rv.concat(nextItem); - if (it.hasNext()) rv = rv.concat(str); - } - - return rv; - } + return rv; + } -} \ No newline at end of file +} diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvrange.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvrange.java index 9c24c70..c9cb4b9 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvrange.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvrange.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import com.teragrep.pth10.ast.time.RelativeTimeParser; @@ -57,71 +56,68 @@ /** * UDF used for command mvrange(start, end, step)
- * end is excluded from the resulting field,
- * step can be an integer or a timespan (string)
- *
- * While using a timespan, if the increment causes the
- * time to increment past the end time, it will not
- * be included in the resulting field.
+ * end is excluded from the resulting field, step can be an integer or a timespan (string)
+ * While using a timespan, if the increment causes the time to increment past the end time, it will not be included in
+ * the resulting field.
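 * Illustrative sketch, not part of the original patch: mvrange(1, 10, 2) would produce
 * ["1", "3", "5", "7", "9"], since the end value (10) is excluded. With an assumed timespan step
 * such as "1h", the start is treated as a unix timestamp and incremented until a value would
 * pass the end time.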
* * @author eemhu - * */ public class Mvrange implements UDF3>, Serializable { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; + + @Override + public List call(Integer start, Integer end, Object stepObj) throws Exception { + Integer step = null; + String stepStr = null; + + // Take step in as an Object, and check if it is String or Integer + // This allows the use of a single class for different input argument types, + // instead of making multiples of Mvrange for Integer / String. + if (stepObj instanceof Long) { + step = ((Long) stepObj).intValue(); + } + else if (stepObj instanceof Integer) { + step = ((Integer) stepObj); + } + else if (stepObj instanceof String) { + stepStr = ((String) stepObj); + } + else { + throw new RuntimeException( + "Mvrange: Step increment argument could not be interpreted into a valid argument. Make sure the argument is an integer or a timespan." + ); + } + + List rv = new ArrayList<>(); + // Numeric increment step + if (step != null) { + for (int i = start; i < end; i = i + step) { + rv.add(String.valueOf(i)); + } + } + // Unix time increment + else { + // Add start to mv field + long time = start; + rv.add(String.valueOf(time)); + + RelativeTimeParser rtParser = new RelativeTimeParser(); + RelativeTimestamp rtTimestamp = rtParser.parse("+" + stepStr); + // Go until incremented past end + while (time < end) { + time = rtTimestamp.calculate(new Timestamp(time * 1000L)); + + // If time went past end, stop incrementing and don't add to mv field + if (time > end) { + break; + } - @Override - public List call(Integer start, Integer end, Object stepObj) throws Exception { - Integer step = null; - String stepStr = null; - - // Take step in as an Object, and check if it is String or Integer - // This allows the use of a single class for different input argument types, - // instead of making multiples of Mvrange for Integer / String. - if (stepObj instanceof Long) { - step = ((Long)stepObj).intValue(); - } - else if (stepObj instanceof Integer) { - step = ((Integer) stepObj); - } - else if (stepObj instanceof String) { - stepStr = ((String) stepObj); - } - else { - throw new RuntimeException("Mvrange: Step increment argument could not be interpreted into a valid argument. 
Make sure the argument is an integer or a timespan."); - } - - - List rv = new ArrayList<>(); - // Numeric increment step - if (step != null) { - for (int i = start; i < end; i = i + step) { - rv.add(String.valueOf(i)); - } - } - // Unix time increment - else { - // Add start to mv field - long time = start; - rv.add(String.valueOf(time)); + rv.add(String.valueOf(time)); + } + } - RelativeTimeParser rtParser = new RelativeTimeParser(); - RelativeTimestamp rtTimestamp = rtParser.parse("+" + stepStr); - // Go until incremented past end - while ( time < end ) { - time = rtTimestamp.calculate(new Timestamp(time*1000L)); - - // If time went past end, stop incrementing and don't add to mv field - if ( time > end ) { - break; - } - - rv.add(String.valueOf(time)); - } - } - - return rv; - } + return rv; + } -} \ No newline at end of file +} diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvzip.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvzip.java index a7df380..21a46d2 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvzip.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Mvzip.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF3; @@ -59,32 +58,33 @@ * x = mv field 1
* y = mv field 2
* z = (optional) delimiter, defaults to comma
+ * * @author eemhu - * */ public class Mvzip implements UDF3, WrappedArray, String, List>, Serializable { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; + + @Override + public List call(WrappedArray mvfield1, WrappedArray mvfield2, String delimiter) + throws Exception { + // If delimiter is null, replace with default. + // Should not happen in practice since it is already checked in EvalStatement.evalMethodMvzipEmitCatalyst() + if (delimiter == null) { + delimiter = ","; + } + + // iterators for both mvfields + Iterator it1 = mvfield1.iterator(); + Iterator it2 = mvfield2.iterator(); + + // Go through the arrays and add to zipped list + List zipped = new ArrayList(); + while (it1.hasNext() && it2.hasNext()) { + zipped.add(it1.next() + delimiter + it2.next()); + } - @Override - public List call(WrappedArray mvfield1, WrappedArray mvfield2, String delimiter) throws Exception { - // If delimiter is null, replace with default. - // Should not happen in practice since it is already checked in EvalStatement.evalMethodMvzipEmitCatalyst() - if (delimiter == null) { - delimiter = ","; - } - - // iterators for both mvfields - Iterator it1 = mvfield1.iterator(); - Iterator it2 = mvfield2.iterator(); - - // Go through the arrays and add to zipped list - List zipped = new ArrayList(); - while (it1.hasNext() && it2.hasNext()) { - zipped.add(it1.next() + delimiter + it2.next()); - } - - return zipped; - } + return zipped; + } -} \ No newline at end of file +} diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/RandomNumber.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/RandomNumber.java index b4a8726..5cfb11f 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/RandomNumber.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/RandomNumber.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF0; @@ -57,17 +56,18 @@ * @author eemhu */ public class RandomNumber implements UDF0, Serializable { - private static final long serialVersionUID = 1L; - @Override - public Integer call() throws Exception { - double max = Math.pow(2d, 31d) - 1; - double min = 0; - Double rndNum = Math.floor(Math.random() * (max - min + 1)) + min; - - int res = rndNum.intValue(); - - return res; - } + private static final long serialVersionUID = 1L; + + @Override + public Integer call() throws Exception { + double max = Math.pow(2d, 31d) - 1; + double min = 0; + Double rndNum = Math.floor(Math.random() * (max - min + 1)) + min; + + int res = rndNum.intValue(); + + return res; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/RegexMatch.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/RegexMatch.java index ab22ffa..caacb5c 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/RegexMatch.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/RegexMatch.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import com.teragrep.pth10.ast.NullValue; @@ -63,108 +62,111 @@ * Returns true if regex matches subject, otherwise false.
* "isMultivalue=false" Goes through a normal field, and returns whether or not there was a match
* "isMultivalue=true" Goes through a multi-value field, and returns index of first match
+ * * @author eemhu - * */ public class RegexMatch implements UDF2, Serializable { - private static final long serialVersionUID = 1L; - private final boolean isMultiValue; - private final NullValue nullValue; - - public RegexMatch(NullValue nullValue) { - super(); - this.isMultiValue = false; - this.nullValue = nullValue; - } - - public RegexMatch(boolean isMultiValue, NullValue nullValue) { - super(); - this.isMultiValue = isMultiValue; - this.nullValue = nullValue; - } - - @Override - public Object call(Object subject, String regexString) throws Exception { - - String subjectStr = null; - - if (subject instanceof Long) { - subjectStr = ((Long)subject).toString(); - } - else if (subject instanceof Integer) { - subjectStr = ((Integer)subject).toString(); - } - else if (subject instanceof Double) { - subjectStr = ((Double)subject).toString(); - } - else if (subject instanceof Float) { - subjectStr = ((Double)subject).toString(); - } - else if (subject instanceof String) { - subjectStr = ((String)subject); - } - else if (subject instanceof java.sql.Timestamp) { - subjectStr = ((java.sql.Timestamp)subject).toString(); - } - - - if (!this.isMultiValue) { - return performForNormalField(subjectStr, regexString); - } - else { - @SuppressWarnings("unchecked") - WrappedArray subjectLst = (WrappedArray) subject; - - return performForMultiValueField(subjectLst, regexString); - } - - } - - // This gets called if isMultiValue=false - // Goes through a normal field, and returns whether or not there was a match - private Boolean performForNormalField(String subjectStr, String regexString) { - regexString = new UnquotedText(new TextString(regexString)).read(); - boolean isMatch = false; - - try { - Pattern p = Pattern.compile(regexString); - Matcher m = p.matcher(subjectStr); - isMatch = m.find(); - } - catch (PatternSyntaxException pse) { - throw new RuntimeException("Match command encountered an error compiling the regex pattern: " + pse.getMessage()); - } - - return isMatch; - } - - // This gets called if isMultiValue=true - // Goes through a multi-value field, and returns index of first match - private Object performForMultiValueField(WrappedArray subjectLst, String regexString) { - Pattern p; - - try { - p = Pattern.compile(regexString); - } - catch (PatternSyntaxException pse) { - throw new RuntimeException("Match command encountered an error compiling the regex pattern: " + pse.getMessage()); - } - - Iterator it = subjectLst.iterator(); - int i = 0; - while (it.hasNext()) { - Matcher m = p.matcher(it.next()); - boolean isMatch = m.find(); - - if (isMatch) { - return i; - } - - i++; - } - - return nullValue.value(); - } + private static final long serialVersionUID = 1L; + private final boolean isMultiValue; + private final NullValue nullValue; + + public RegexMatch(NullValue nullValue) { + super(); + this.isMultiValue = false; + this.nullValue = nullValue; + } + + public RegexMatch(boolean isMultiValue, NullValue nullValue) { + super(); + this.isMultiValue = isMultiValue; + this.nullValue = nullValue; + } + + @Override + public Object call(Object subject, String regexString) throws Exception { + + String subjectStr = null; + + if (subject instanceof Long) { + subjectStr = ((Long) subject).toString(); + } + else if (subject instanceof Integer) { + subjectStr = ((Integer) subject).toString(); + } + else if (subject instanceof Double) { + subjectStr = ((Double) subject).toString(); + } + else if (subject instanceof Float) { + subjectStr = ((Double) subject).toString(); + } + else if (subject instanceof 
String) { + subjectStr = ((String) subject); + } + else if (subject instanceof java.sql.Timestamp) { + subjectStr = ((java.sql.Timestamp) subject).toString(); + } + + if (!this.isMultiValue) { + return performForNormalField(subjectStr, regexString); + } + else { + @SuppressWarnings("unchecked") + WrappedArray subjectLst = (WrappedArray) subject; + + return performForMultiValueField(subjectLst, regexString); + } + + } + + // This gets called if isMultiValue=false + // Goes through a normal field, and returns whether or not there was a match + private Boolean performForNormalField(String subjectStr, String regexString) { + regexString = new UnquotedText(new TextString(regexString)).read(); + boolean isMatch = false; + + try { + Pattern p = Pattern.compile(regexString); + Matcher m = p.matcher(subjectStr); + isMatch = m.find(); + } + catch (PatternSyntaxException pse) { + throw new RuntimeException( + "Match command encountered an error compiling the regex pattern: " + pse.getMessage() + ); + } + + return isMatch; + } + + // This gets called if isMultiValue=true + // Goes through a multi-value field, and returns index of first match + private Object performForMultiValueField(WrappedArray subjectLst, String regexString) { + Pattern p; + + try { + p = Pattern.compile(regexString); + } + catch (PatternSyntaxException pse) { + throw new RuntimeException( + "Match command encountered an error compiling the regex pattern: " + pse.getMessage() + ); + } + + Iterator it = subjectLst.iterator(); + int i = 0; + while (it.hasNext()) { + Matcher m = p.matcher(it.next()); + boolean isMatch = m.find(); + + if (isMatch) { + return i; + } + + i++; + } + + return nullValue.value(); + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Relative_time.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Relative_time.java index 16f26a8..fca1fd5 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Relative_time.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Relative_time.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import com.teragrep.pth10.ast.time.RelativeTimeParser; @@ -55,18 +54,18 @@ /** * UDF for command relative_time(unixtime, modifier)
+ * * @author eemhu - * */ public class Relative_time implements UDF2, Serializable { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; - @Override - public Long call(Long unixtime, String modifier) throws Exception { - RelativeTimeParser rtParser = new RelativeTimeParser(); - RelativeTimestamp rtTimestamp = rtParser.parse(modifier); - return rtTimestamp.calculate(new Timestamp(unixtime*1000L)); - } + @Override + public Long call(Long unixtime, String modifier) throws Exception { + RelativeTimeParser rtParser = new RelativeTimeParser(); + RelativeTimestamp rtTimestamp = rtParser.parse(modifier); + return rtTimestamp.calculate(new Timestamp(unixtime * 1000L)); + } -} \ No newline at end of file +} diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Sigfig.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Sigfig.java index 95f1dbf..646a6d8 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Sigfig.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Sigfig.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF3; @@ -58,132 +57,137 @@ import java.util.ArrayList; import java.util.List; - /** * UDF used for command sigfig(x)
- *

The computation for sigfig is based on the type of calculation that generates the number:

- *

* / result should have minimum number of significant figures of all of the operands
- * + - result should have the same amount of sigfigs as the least precise number of all of the operands

+ *

+ * The computation for sigfig is based on the type of calculation that generates the number: + *

+ *

+ * * / result should have minimum number of significant figures of all of the operands
+ * + - result should have the same amount of sigfigs as the least precise number of all of the operands + *

+ * * @author eemhu - * */ public class Sigfig implements UDF3, Double>, Serializable { - private static final Logger LOGGER = LoggerFactory.getLogger(Sigfig.class); - private static final long serialVersionUID = 1L; - - // Characters used for different arithmetic operations - private static final char MULTIPLICATION_CHAR = '*'; - private static final char DIVISION_CHAR = '/'; - private static final char ADDITION_CHAR = '+'; - private static final char SUBTRACTION_CHAR = '-'; + private static final Logger LOGGER = LoggerFactory.getLogger(Sigfig.class); + + private static final long serialVersionUID = 1L; + + // Characters used for different arithmetic operations + private static final char MULTIPLICATION_CHAR = '*'; + private static final char DIVISION_CHAR = '/'; + private static final char ADDITION_CHAR = '+'; + private static final char SUBTRACTION_CHAR = '-'; @Override - public Double call(Object calcResAsObject, String calcText, WrappedArray wrappedArrayOfColObjects) throws Exception { - - BigDecimal calcRes = new BigDecimal(calcResAsObject.toString()); - - // Get the iterator for WrappedArray - Iterator itr = wrappedArrayOfColObjects.iterator(); - List calculatedCols = new ArrayList<>(); - - // Get all objects (numbers) as BigDecimal with the iterator - // and put them into calculatedCols java list - // BigDecimal retains insignificant digits unlike double - while (itr.hasNext()) { - Object obj = itr.next(); - - if (obj instanceof Long) { - calculatedCols.add(BigDecimal.valueOf((Long)obj)); - } - else if (obj instanceof Integer) { - calculatedCols.add(BigDecimal.valueOf((Integer)obj)); - } - else if (obj instanceof Float) { - calculatedCols.add(BigDecimal.valueOf((Float)obj)); - } - else if (obj instanceof Double) { - calculatedCols.add(BigDecimal.valueOf((Double)obj)); - } - else if (obj instanceof String) { - calculatedCols.add(new BigDecimal((String)obj)); - } - else { - // Throw exception if number was not any of the types tested above - throw new RuntimeException("Sigfig: Could not parse field content into java.math.BigDecimal"); - } - } - - // The computation for sigfig is based on the type of calculation that generates the number - // * / result should have minimum number of significant figures of all of the operands - // + - result should have the same amount of sigfigs as the least precise number of all of the operands - - double rv; - BigDecimal input = calcRes; //BigDecimal.valueOf(calcRes); - - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("calc(result)= <{}>", calcRes); - LOGGER.debug("calc(text)= <{}>", calcText); - } - - int multiIndex = calcText.indexOf(MULTIPLICATION_CHAR); - int divIndex = calcText.indexOf(DIVISION_CHAR); - int plusIndex = calcText.indexOf(ADDITION_CHAR); - int minusIndex = calcText.indexOf(SUBTRACTION_CHAR); - - // * or / - if (multiIndex != -1 || divIndex != -1) { - int minPrecision = Integer.MAX_VALUE; - - for (BigDecimal val : calculatedCols) { - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("val=<{}>", val); - } - - BigDecimal currentValue = val; - int scale = currentValue.scale(); - int precision = currentValue.precision(); - - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("scale=<{}> precision=<{}>",scale,precision); - } - - if (precision < minPrecision) { - minPrecision = precision; - } - } - - rv = input.round(new MathContext(minPrecision)).doubleValue(); - } - // + or - - else if (plusIndex != -1 || minusIndex != -1) { - int minScale = Integer.MAX_VALUE; - - for (BigDecimal val : calculatedCols) { - if (LOGGER.isDebugEnabled()){ - 
LOGGER.debug("val=<{}>", val); - } - - BigDecimal currentValue = val; - int scale = currentValue.scale(); - int precision = currentValue.precision(); - - if (LOGGER.isDebugEnabled()){ - LOGGER.debug("scale=<{}> precision=<{}>", scale, precision); - } - - if (scale < minScale) { - minScale = scale; - } - } - - rv = input.round(new MathContext(minScale)).doubleValue(); - } - else { - // No * / + - found - throw new RuntimeException("Sigfig: Could not determine the type of arithmetic operation"); - } - - return rv; - } -} \ No newline at end of file + public Double call(Object calcResAsObject, String calcText, WrappedArray wrappedArrayOfColObjects) + throws Exception { + + BigDecimal calcRes = new BigDecimal(calcResAsObject.toString()); + + // Get the iterator for WrappedArray + Iterator itr = wrappedArrayOfColObjects.iterator(); + List calculatedCols = new ArrayList<>(); + + // Get all objects (numbers) as BigDecimal with the iterator + // and put them into calculatedCols java list + // BigDecimal retains insignificant digits unlike double + while (itr.hasNext()) { + Object obj = itr.next(); + + if (obj instanceof Long) { + calculatedCols.add(BigDecimal.valueOf((Long) obj)); + } + else if (obj instanceof Integer) { + calculatedCols.add(BigDecimal.valueOf((Integer) obj)); + } + else if (obj instanceof Float) { + calculatedCols.add(BigDecimal.valueOf((Float) obj)); + } + else if (obj instanceof Double) { + calculatedCols.add(BigDecimal.valueOf((Double) obj)); + } + else if (obj instanceof String) { + calculatedCols.add(new BigDecimal((String) obj)); + } + else { + // Throw exception if number was not any of the types tested above + throw new RuntimeException("Sigfig: Could not parse field content into java.math.BigDecimal"); + } + } + + // The computation for sigfig is based on the type of calculation that generates the number + // * / result should have minimum number of significant figures of all of the operands + // + - result should have the same amount of sigfigs as the least precise number of all of the operands + + double rv; + BigDecimal input = calcRes; //BigDecimal.valueOf(calcRes); + + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("calc(result)= <{}>", calcRes); + LOGGER.debug("calc(text)= <{}>", calcText); + } + + int multiIndex = calcText.indexOf(MULTIPLICATION_CHAR); + int divIndex = calcText.indexOf(DIVISION_CHAR); + int plusIndex = calcText.indexOf(ADDITION_CHAR); + int minusIndex = calcText.indexOf(SUBTRACTION_CHAR); + + // * or / + if (multiIndex != -1 || divIndex != -1) { + int minPrecision = Integer.MAX_VALUE; + + for (BigDecimal val : calculatedCols) { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("val=<{}>", val); + } + + BigDecimal currentValue = val; + int scale = currentValue.scale(); + int precision = currentValue.precision(); + + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("scale=<{}> precision=<{}>", scale, precision); + } + + if (precision < minPrecision) { + minPrecision = precision; + } + } + + rv = input.round(new MathContext(minPrecision)).doubleValue(); + } + // + or - + else if (plusIndex != -1 || minusIndex != -1) { + int minScale = Integer.MAX_VALUE; + + for (BigDecimal val : calculatedCols) { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("val=<{}>", val); + } + + BigDecimal currentValue = val; + int scale = currentValue.scale(); + int precision = currentValue.precision(); + + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("scale=<{}> precision=<{}>", scale, precision); + } + + if (scale < minScale) { + minScale = scale; + } + } + + rv = 
input.round(new MathContext(minScale)).doubleValue(); + } + else { + // No * / + - found + throw new RuntimeException("Sigfig: Could not determine the type of arithmetic operation"); + } + + return rv; + } +} diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Spath.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Spath.java index 7133d5b..4662870 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Spath.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Spath.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import com.google.gson.*; @@ -71,202 +70,227 @@ /** * UDF for command spath(json/xml, spath)
- *

First, the given (assumed to be) JSON/XML string is tried to be parsed as JSON, and if that fails, - * XML parsing is attempted. Otherwise the function will return an empty result, - * or the original input if the input and output column are set to the same column.

- * + *

+ * First, the given (assumed to be) JSON/XML string is tried to be parsed as JSON, and if that fails, XML parsing is + * attempted. Otherwise the function will return an empty result, or the original input if the input and output column + * are set to the same column. + *

* A separate 'xpath' command can be used for xpath expressions. * * @author eemhu */ public class Spath implements UDF4>, Serializable { - private static final Logger LOGGER = LoggerFactory.getLogger(Spath.class); - private static final long serialVersionUID = 1L; - private final NullValue nullValue; + private static final Logger LOGGER = LoggerFactory.getLogger(Spath.class); + private static final long serialVersionUID = 1L; + + private final NullValue nullValue; public Spath(NullValue nullValue) { - super(); - this.nullValue = nullValue; - } + super(); + this.nullValue = nullValue; + } + + @Override + public Map call(String input, String spathExpr, String nameOfInputCol, String nameOfOutputCol) + throws Exception { + // Map to return at the end of this function + final Map result = new HashMap<>(); + try { + // try json + final Gson gson = new Gson(); + final JsonElement jsonElem = gson.fromJson(input, JsonElement.class); - @Override - public Map call(String input, String spathExpr, String nameOfInputCol, String nameOfOutputCol) throws Exception { - // Map to return at the end of this function - final Map result = new HashMap<>(); - try { - // try json - final Gson gson = new Gson(); - final JsonElement jsonElem = gson.fromJson(input, JsonElement.class); + // Auto-extraction (JSON) + if (spathExpr == null) { + // expect topmost element to be an object + for (Map.Entry sub : jsonElem.getAsJsonObject().entrySet()) { + // put key:value to map - unescaping result in case was a nested json string + result + .put( + sub.getKey(), + new UnquotedText( + new TextString(StringEscapeUtils.unescapeJson(sub.getValue().toString())) + ).read() + ); + } + } + // Manual extraction via spath expression (JSON) + else { + final JsonElement jsonSubElem = getJsonElement( + jsonElem, new UnquotedText(new TextString(spathExpr)).read() + ); + // put key:value to map - unescaping result in case was a nested json string + result + .put( + spathExpr, + jsonSubElem != null ? new UnquotedText( + new TextString(StringEscapeUtils.unescapeJson(jsonSubElem.toString())) + ).read() : nullValue.value() + ); + } + return result; + } + catch (JsonSyntaxException | ClassCastException json_fail) { + LOGGER.warn("Processing failed as JSON, trying XML parsing. Error: <{}>", json_fail.getMessage()); + // try xml + try { + Document doc = getXmlDocFromString(input); - // Auto-extraction (JSON) - if (spathExpr == null) { - // expect topmost element to be an object - for (Map.Entry sub : jsonElem.getAsJsonObject().entrySet()) { - // put key:value to map - unescaping result in case was a nested json string - result.put(sub.getKey(), new UnquotedText(new TextString(StringEscapeUtils.unescapeJson(sub.getValue().toString()))).read()); - } - } - // Manual extraction via spath expression (JSON) - else { - final JsonElement jsonSubElem = getJsonElement(jsonElem, new UnquotedText(new TextString(spathExpr)).read()); - // put key:value to map - unescaping result in case was a nested json string - result.put(spathExpr, jsonSubElem != null ? - new UnquotedText(new TextString(StringEscapeUtils.unescapeJson(jsonSubElem.toString()))).read() : - nullValue.value()); - } - return result; - } catch (JsonSyntaxException | ClassCastException json_fail) { - LOGGER.warn("Processing failed as JSON, trying XML parsing. 
Error: <{}>", json_fail.getMessage()); - // try xml - try { - Document doc = getXmlDocFromString(input); + if (doc == null) { + // failed to make document from string + return result; + } - if (doc == null) { - // failed to make document from string - return result; - } - - // Auto-extraction (XML) - if (spathExpr == null) { - // Each tag-pair containing text inside will be given a new column - // main-sub-item would contain all for that type of nested tags, etc. - final Node rootNode = doc.getDocumentElement(); - buildMapFromXmlNodes(rootNode, ".", result); - } - // Manual extraction via spath expression (XML) - else { - // spath expects spath at all times, even when input is XML - // spath needs to be converted to xpath for xml - final XPath xPath = XPathFactory.newInstance().newXPath(); + // Auto-extraction (XML) + if (spathExpr == null) { + // Each tag-pair containing text inside will be given a new column + // main-sub-item would contain all for that type of nested tags, etc. + final Node rootNode = doc.getDocumentElement(); + buildMapFromXmlNodes(rootNode, ".", result); + } + // Manual extraction via spath expression (XML) + else { + // spath expects spath at all times, even when input is XML + // spath needs to be converted to xpath for xml + final XPath xPath = XPathFactory.newInstance().newXPath(); - // spath is of type main.sub.item, convert to /main/sub/item - String spathAsXpath = "/".concat(new UnquotedText(new TextString(spathExpr)).read()).replaceAll("\\.","/"); - LOGGER.debug("spath->xpath conversion: <[{}]>", spathAsXpath); + // spath is of type main.sub.item, convert to /main/sub/item + String spathAsXpath = "/" + .concat(new UnquotedText(new TextString(spathExpr)).read()) + .replaceAll("\\.", "/"); + LOGGER.debug("spath->xpath conversion: <[{}]>", spathAsXpath); - String rv = (String) xPath.compile(spathAsXpath).evaluate(doc, XPathConstants.STRING); - result.put(spathExpr, rv.trim()); - } - return result; - } catch (Exception e) { - LOGGER.warn("spath: The content couldn't be parsed as JSON or XML. Details: <{}>", e.getMessage()); - // return pre-existing content if output is the same as input - if (nameOfInputCol.equals(nameOfOutputCol)) { - result.put(spathExpr, input); - } - // otherwise output will be empty on error - else { - result.put(spathExpr, nullValue.value()); - } - return result; - } + String rv = (String) xPath.compile(spathAsXpath).evaluate(doc, XPathConstants.STRING); + result.put(spathExpr, rv.trim()); + } + return result; + } + catch (Exception e) { + LOGGER.warn("spath: The content couldn't be parsed as JSON or XML. 
Details: <{}>", e.getMessage()); + // return pre-existing content if output is the same as input + if (nameOfInputCol.equals(nameOfOutputCol)) { + result.put(spathExpr, input); + } + // otherwise output will be empty on error + else { + result.put(spathExpr, nullValue.value()); + } + return result; + } - } - } + } + } - /** - * Gets JSON element from JSON based on the given SPath expression - * @param json JSONElement to get the (sub)element from - * @param spath SPath expression which expresses the element to get - */ - private JsonElement getJsonElement(final JsonElement json, final String spath) { - final String[] parts = spath.split("[.\\[\\]]"); - JsonElement rv = json; + /** + * Gets JSON element from JSON based on the given SPath expression + * + * @param json JSONElement to get the (sub)element from + * @param spath SPath expression which expresses the element to get + */ + private JsonElement getJsonElement(final JsonElement json, final String spath) { + final String[] parts = spath.split("[.\\[\\]]"); + JsonElement rv = json; - for (String key : parts) { - key = key.trim(); + for (String key : parts) { + key = key.trim(); - LOGGER.debug("Got key: <{}>", key); + LOGGER.debug("Got key: <{}>", key); - if (key.isEmpty()) { - LOGGER.debug("Key was empty"); - continue; - } + if (key.isEmpty()) { + LOGGER.debug("Key was empty"); + continue; + } - if (rv == null || rv.isJsonNull()) { - LOGGER.debug("Given JsonElement was a NULL"); - rv = JsonNull.INSTANCE; - break; - } + if (rv == null || rv.isJsonNull()) { + LOGGER.debug("Given JsonElement was a NULL"); + rv = JsonNull.INSTANCE; + break; + } - if (rv.isJsonObject()) { - LOGGER.debug("Given JsonElement was an OBJECT"); - rv = ((JsonObject)rv).get(key); - } else if (rv.isJsonArray()) { - LOGGER.debug("Given JsonElement was an ARRAY"); - int i = Integer.parseInt(key) - 1; - rv = ((JsonArray)rv).get(i); - } else { - LOGGER.debug("Given JsonElement was something else"); - break; - } - } + if (rv.isJsonObject()) { + LOGGER.debug("Given JsonElement was an OBJECT"); + rv = ((JsonObject) rv).get(key); + } + else if (rv.isJsonArray()) { + LOGGER.debug("Given JsonElement was an ARRAY"); + int i = Integer.parseInt(key) - 1; + rv = ((JsonArray) rv).get(i); + } + else { + LOGGER.debug("Given JsonElement was something else"); + break; + } + } - return rv; - } + return rv; + } - /** - * Convert an XML-formatted string to a Document object - * @param xmlStr XML-formatted string - * @return (XML) Document object - */ - private Document getXmlDocFromString(final String xmlStr) { - final DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance(); + /** + * Convert an XML-formatted string to a Document object + * + * @param xmlStr XML-formatted string + * @return (XML) Document object + */ + private Document getXmlDocFromString(final String xmlStr) { + final DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance(); - DocumentBuilder docBuilder; + DocumentBuilder docBuilder; - try { - docBuilder = docBuilderFactory.newDocumentBuilder(); - return docBuilder.parse(new InputSource(new StringReader(xmlStr))); - } catch (Exception e) { - LOGGER.warn("Failed to parse XML: <{}>", e); - return null; - } - } + try { + docBuilder = docBuilderFactory.newDocumentBuilder(); + return docBuilder.parse(new InputSource(new StringReader(xmlStr))); + } + catch (Exception e) { + LOGGER.warn("Failed to parse XML: <{}>", e); + return null; + } + } - /** - * Adds all 'node tag - contents' Key-Value pairs to the map - *
tag.tag.tag => contents
- * @param rootNode root node (Main Document Element) - * @param spacer Spacer string between each parent->child in key name - * @param map Final map to be returned out of the UDF - */ - private void buildMapFromXmlNodes(final Node rootNode, final String spacer, final Map map) { - // RootNode is text - if (rootNode.getNodeName().equals("#text")) { - String colName = ""; + /** + * Adds all 'node tag - contents' Key-Value pairs to the map
tag.tag.tag => contents
+ * + * @param rootNode root node (Main Document Element) + * @param spacer Spacer string between each parent->child in key name + * @param map Final map to be returned out of the UDF + */ + private void buildMapFromXmlNodes(final Node rootNode, final String spacer, final Map map) { + // RootNode is text + if (rootNode.getNodeName().equals("#text")) { + String colName = ""; - Node parent = rootNode.getParentNode(); - boolean isFirst = true; + Node parent = rootNode.getParentNode(); + boolean isFirst = true; - // add each parent of parent as the top-most label - // grandparent.parent.child - // loop goes from child -> parent -> grandparent - while (parent != null) { - // top-most parent is #document, not needed - if (!parent.getNodeName().equals("#document")) { - colName = parent.getNodeName().concat(isFirst ? "" : spacer).concat(colName); - } + // add each parent of parent as the top-most label + // grandparent.parent.child + // loop goes from child -> parent -> grandparent + while (parent != null) { + // top-most parent is #document, not needed + if (!parent.getNodeName().equals("#document")) { + colName = parent.getNodeName().concat(isFirst ? "" : spacer).concat(colName); + } - // get parent of parent - parent = parent.getParentNode(); - isFirst = false; - } + // get parent of parent + parent = parent.getParentNode(); + isFirst = false; + } - // if there are multiple columns of the same name, add value to existing column - if (map.containsKey(colName)) { - String existingValue = map.get(colName); - map.put(colName, existingValue.concat("\n").concat(rootNode.getTextContent())); - } else { - map.put(colName, rootNode.getTextContent()); - } - } + // if there are multiple columns of the same name, add value to existing column + if (map.containsKey(colName)) { + String existingValue = map.get(colName); + map.put(colName, existingValue.concat("\n").concat(rootNode.getTextContent())); + } + else { + map.put(colName, rootNode.getTextContent()); + } + } - final NodeList nl = rootNode.getChildNodes(); - // Visit children of current node - for (int i = 0; i < nl.getLength(); i++) { - Node child = nl.item(i); - buildMapFromXmlNodes(child, spacer, map); - } - } + final NodeList nl = rootNode.getChildNodes(); + // Visit children of current node + for (int i = 0; i < nl.getLength(); i++) { + Node child = nl.item(i); + buildMapFromXmlNodes(child, spacer, map); + } + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/TimeToUnixTime.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/TimeToUnixTime.java index 228ffbe..1c6d093 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/TimeToUnixTime.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/TimeToUnixTime.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF1; @@ -58,12 +57,12 @@ import java.time.format.DateTimeFormatter; /** - * Converts Timestamp/String object to a unix epoch, - * otherwise Long/Integer will be returned as-is + * Converts Timestamp/String object to a unix epoch, otherwise Long/Integer will be returned as-is */ public class TimeToUnixTime implements UDF1 { private static final Logger LOGGER = LoggerFactory.getLogger(TimeToUnixTime.class); + /** * @param time Object of type timestamp or unix time * @return as unix time @@ -74,27 +73,27 @@ public Long call(Object time) throws Exception { long unixtime = 0L; if (time instanceof Timestamp) { - Timestamp ts = (Timestamp)time; - LOGGER.debug("Time was detected as TIMESTAMP, epoch: <{}>", ts.getTime()/1000L); + Timestamp ts = (Timestamp) time; + LOGGER.debug("Time was detected as TIMESTAMP, epoch: <{}>", ts.getTime() / 1000L); unixtime = ts.getTime() / 1000L; } else if (time instanceof Long) { LOGGER.debug("Time was directly a LONG/EPOCH: <{}>", time); - unixtime = (Long)time; + unixtime = (Long) time; } else if (time instanceof Integer) { LOGGER.debug("Time was an INTEGER: <{}>", time); - unixtime = ((Integer)time).longValue(); + unixtime = ((Integer) time).longValue(); } else if (time instanceof String) { LOGGER.debug("Time was a STRING: <{}>", time); try { LOGGER.debug("Attempting to use as-is (epoch)"); - unixtime = Long.parseLong(((String)time)); + unixtime = Long.parseLong(((String) time)); } catch (NumberFormatException nfe) { LOGGER.debug("Failed, attempting to parse as a ISO_ZONED_DATE_TIME"); - String timeStr = (String)time; + String timeStr = (String) time; LocalDateTime ldt = LocalDateTime.parse(timeStr, DateTimeFormatter.ISO_ZONED_DATE_TIME); ZonedDateTime zdt = ZonedDateTime.of(ldt, ZoneId.systemDefault()); ZonedDateTime asUtc = zdt.withZoneSameInstant(ZoneOffset.UTC); diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/TimestampArithmetic.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/TimestampArithmetic.java index 8af93c2..dedd893 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/TimestampArithmetic.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/TimestampArithmetic.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF3; @@ -51,6 +50,7 @@ import java.sql.Timestamp; public class TimestampArithmetic implements UDF3 { + @Override public Timestamp call(Object o, Object o2, String op) throws Exception { // Convert all of them to long @@ -59,19 +59,19 @@ public Timestamp call(Object o, Object o2, String op) throws Exception { // Left side if (o instanceof Timestamp) { - left = ((Timestamp)o).getTime()/1000L; + left = ((Timestamp) o).getTime() / 1000L; } else if (o instanceof Long) { - left = (Long)o; + left = (Long) o; } else if (o instanceof String) { - left = Long.parseLong(((String)o)); + left = Long.parseLong(((String) o)); } else if (o instanceof Double) { - left = ((Double)o).longValue(); + left = ((Double) o).longValue(); } else if (o instanceof Integer) { - left = ((Integer)o).longValue(); + left = ((Integer) o).longValue(); } else { throw new RuntimeException("Left side of timestamp arithmetic was of unsupported type: " + o.toString()); @@ -79,19 +79,19 @@ else if (o instanceof Integer) { // Right side if (o2 instanceof Timestamp) { - right = ((Timestamp)o2).getTime()/1000L; + right = ((Timestamp) o2).getTime() / 1000L; } else if (o2 instanceof Long) { - right = (Long)o2; + right = (Long) o2; } else if (o2 instanceof String) { - right = Long.parseLong(((String)o2)); + right = Long.parseLong(((String) o2)); } else if (o2 instanceof Double) { - right = ((Double)o2).longValue(); + right = ((Double) o2).longValue(); } else if (o2 instanceof Integer) { - right = ((Integer)o2).longValue(); + right = ((Integer) o2).longValue(); } else { throw new RuntimeException("Right side of timestamp arithmetic was of unsupported type: " + o2.toString()); diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Tonumber.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Tonumber.java index bc58059..bd96820 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Tonumber.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/Tonumber.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import com.teragrep.pth10.ast.NullValue; @@ -57,34 +56,37 @@ * UDF for eval method tonumber(numstr, base)
* Converts a numeric string to a long of base. */ -public class Tonumber implements UDF2, Serializable{ - private static final Logger LOGGER = LoggerFactory.getLogger(Tonumber.class); - private static final long serialVersionUID = 1L; - private final NullValue nullValue; +public class Tonumber implements UDF2, Serializable { + + private static final Logger LOGGER = LoggerFactory.getLogger(Tonumber.class); + private static final long serialVersionUID = 1L; + private final NullValue nullValue; + + public Tonumber(NullValue nullValue) { + super(); + this.nullValue = nullValue; + } + + @Override + public Object call(String numstr, Integer base) throws Exception { + Object rv = nullValue.value(); + + if (base < 2 || base > 36) { + throw new UnsupportedOperationException( + "Tonumber: 'base' argument should be an integer value between 2 and 36." + ); + } - public Tonumber(NullValue nullValue) { - super(); - this.nullValue = nullValue; - } + // try parsing, otherwise return null + try { + rv = Long.valueOf(numstr, base); + } + catch (NumberFormatException nfe) { + LOGGER.warn("Tonumber: Error parsing, returning 'null'. Details: <{}>", nfe.getMessage()); + // Could not parse, return null + } - @Override - public Object call(String numstr, Integer base) throws Exception { - Object rv = nullValue.value(); - - if (base < 2 || base > 36) { - throw new UnsupportedOperationException("Tonumber: 'base' argument should be an integer value between 2 and 36."); - } - - // try parsing, otherwise return null - try { - rv = Long.valueOf(numstr, base); - } - catch (NumberFormatException nfe) { - LOGGER.warn("Tonumber: Error parsing, returning 'null'. Details: <{}>", nfe.getMessage()); - // Could not parse, return null - } - - return rv; - } + return rv; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/TypeOf.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/TypeOf.java index 6000017..888113e 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/TypeOf.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/TypeOf.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF1; @@ -51,7 +50,9 @@ import java.io.Serializable; /** - *

UDF used for eval typeof()

+ *

+ * UDF used for eval typeof() + *

*
  *     Java Type 	| TypeOf Result
  *     ----------------------------------
@@ -64,29 +65,32 @@
  *     String 		| String
  *     (Other) 		| Invalid
  * 
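A minimal usage sketch of the mapping above, assuming the UDF is invoked directly rather than registered with Spark:

    import com.teragrep.pth10.ast.commands.evalstatement.UDFs.TypeOf;

    public final class TypeOfSketch {

        public static void main(final String[] args) throws Exception {
            final TypeOf typeOf = new TypeOf();
            System.out.println(typeOf.call(Boolean.TRUE));                // Boolean
            System.out.println(typeOf.call(42L));                         // Number
            System.out.println(typeOf.call(new java.sql.Timestamp(0L)));  // Number (timestamp counts as a number)
            System.out.println(typeOf.call("abc"));                       // String
            System.out.println(typeOf.call(new Object()));                // Invalid
        }
    }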
+ * * @author eemhu - * */ public class TypeOf implements UDF1, Serializable { - private static final long serialVersionUID = 1L; - - @Override - public String call(Object obj) throws Exception { - - if (obj instanceof Boolean) { - return "Boolean"; - } - else if (obj instanceof Integer || obj instanceof Long || obj instanceof Double || obj instanceof Float || obj instanceof java.sql.Timestamp) { - // timestamp is also a number based on docs - return "Number"; - } - else if (obj instanceof String) { - return "String"; - } - else { - return "Invalid"; - } - - } + private static final long serialVersionUID = 1L; + + @Override + public String call(Object obj) throws Exception { + + if (obj instanceof Boolean) { + return "Boolean"; + } + else if ( + obj instanceof Integer || obj instanceof Long || obj instanceof Double || obj instanceof Float + || obj instanceof java.sql.Timestamp + ) { + // timestamp is also a number based on docs + return "Number"; + } + else if (obj instanceof String) { + return "String"; + } + else { + return "Invalid"; + } + + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/UrlDecode.java b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/UrlDecode.java index be9106b..a203144 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/UrlDecode.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/evalstatement/UDFs/UrlDecode.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.evalstatement.UDFs; import org.apache.spark.sql.api.java.UDF1; @@ -53,25 +52,27 @@ /** * UDF for command urldecode(x)
* Returns a decoded URL string. + * * @author eemhu - * */ public class UrlDecode implements UDF1, Serializable { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; + + @Override + public String call(String encodedUrl) throws Exception { + String decodedUrl; + + try { + decodedUrl = java.net.URLDecoder.decode(encodedUrl, java.nio.charset.StandardCharsets.UTF_8.name()); + } + catch (java.io.UnsupportedEncodingException uee) { + throw new RuntimeException( + "An error occurred decoding the URL. UnsupportedEncodingException: " + uee.getMessage() + ); + } + + return decodedUrl; + } - @Override - public String call(String encodedUrl) throws Exception { - String decodedUrl; - - try { - decodedUrl = java.net.URLDecoder.decode(encodedUrl, java.nio.charset.StandardCharsets.UTF_8.name()); - } - catch (java.io.UnsupportedEncodingException uee) { - throw new RuntimeException("An error occurred decoding the URL. UnsupportedEncodingException: " + uee.getMessage()); - } - - return decodedUrl; - } - } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/LogicalStatementCatalyst.java b/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/LogicalStatementCatalyst.java index 58278f1..26c7a6a 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/LogicalStatementCatalyst.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/LogicalStatementCatalyst.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.logicalstatement; import com.teragrep.jue_01.GlobToRegEx; @@ -75,19 +74,22 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; - /** - *

Contains the visitor functions for logicalStatement, which is used - * for the main search function of the language.

- *

These functions help to build the necessary archive query and Spark actions.

- * Example: - *
index=cinnamon earliest=-1y latest=-1d
- *

After the main logicalStatement, multiple - * {@link com.teragrep.pth10.ast.commands.transformstatement.TransformStatement transformStatements} - * that contain aggregations and other functions can be chained, or left unused if the user wants - * to perform a basic search.

+ *

+ * Contains the visitor functions for logicalStatement, which is used for the main search function of the language. + *

+ *

+ * These functions help to build the necessary archive query and Spark actions. + *

+ * Example:
index=cinnamon earliest=-1y latest=-1d
+ *

+ * After the main logicalStatement, multiple + * {@link com.teragrep.pth10.ast.commands.transformstatement.TransformStatement transformStatements} that contain + * aggregations and other functions can be chained, or left unused if the user wants to perform a basic search. + *
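As a rough sketch, assuming a dataset with "index" and "_time" columns (an assumption made for illustration, not taken from this patch), the example query above corresponds to a Spark Column filter along these lines:

    import org.apache.spark.sql.Column;
    import org.apache.spark.sql.functions;

    public final class SearchFilterSketch {

        // Roughly: index=cinnamon earliest=<epoch> latest=<epoch>
        public static Column filter(final long earliest, final long latest) {
            final Column index = functions.col("index").rlike("^cinnamon$");
            final Column earliestCol = functions.col("_time").geq(functions.lit(earliest));
            final Column latestCol = functions.col("_time").leq(functions.lit(latest));
            return index.and(earliestCol).and(latestCol);
        }
    }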

*/ public class LogicalStatementCatalyst extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(LogicalStatementCatalyst.class); private final DPLParserCatalystContext catCtx; @@ -114,6 +116,7 @@ public LogicalStatementCatalyst(DPLParserCatalystContext catCtx) { /** * Visits the parse tree for SearchTransformationRoot and returns a LogicalCatalystStep that can be added to * Steptree in DPLParserCatalystVisitor. + * * @param ctx SearchTransformationRootContext * @return LogicalCatalystStep */ @@ -129,19 +132,22 @@ public AbstractStep visitLogicalStatementCatalyst(DPLParser.SearchTransformation } /** - * The main visitor function for searchTransformation, used for the main search function. - *
+     * The main visitor function for searchTransformation, used for the main search function. 
      *     root : searchTransformationRoot transformStatement?
      *     searchTransformationRoot : logicalStatement
      * 
+ * * @param ctx SearchTransformationRoot context * @return logicalStatement columnNode */ @Override public Node visitSearchTransformationRoot(DPLParser.SearchTransformationRootContext ctx) { ColumnNode rv; - LOGGER.info("[SearchTransformationRoot CAT] Visiting: <{}> with <{}> children", ctx.getText(), ctx.getChildCount()); - + LOGGER + .info( + "[SearchTransformationRoot CAT] Visiting: <{}> with <{}> children", ctx.getText(), + ctx.getChildCount() + ); if (ctx.getChildCount() == 1) { // just a single directoryStatement -or- logicalStatement @@ -150,8 +156,9 @@ public Node visitSearchTransformationRoot(DPLParser.SearchTransformationRootCont else { ParseTree secondChild = ctx.getChild(1); - if (secondChild instanceof TerminalNode && - ((TerminalNode) secondChild).getSymbol().getType() == DPLLexer.OR) { + if ( + secondChild instanceof TerminalNode && ((TerminalNode) secondChild).getSymbol().getType() == DPLLexer.OR + ) { // case: directoryStmt OR logicalStmt ColumnNode dirStatColumnNode = (ColumnNode) visit(ctx.directoryStatement()); ColumnNode logiStatColumnNode = (ColumnNode) visit(ctx.logicalStatement(0)); @@ -162,23 +169,24 @@ public Node visitSearchTransformationRoot(DPLParser.SearchTransformationRootCont Column finalColumn = ((ColumnNode) visit(ctx.directoryStatement())).getColumn(); for (DPLParser.LogicalStatementContext logiStatCtx : ctx.logicalStatement()) { - finalColumn = finalColumn.and(((ColumnNode)visit(logiStatCtx)).getColumn()); + finalColumn = finalColumn.and(((ColumnNode) visit(logiStatCtx)).getColumn()); } rv = new ColumnNode(finalColumn); } } - if (rv != null && rv.getColumn() != null) { - LOGGER.info("Spark column: <{}>", rv.getColumn().toString()); - this.catCtx.setSparkQuery(rv.getColumn().toString()); - } + if (rv != null && rv.getColumn() != null) { + LOGGER.info("Spark column: <{}>", rv.getColumn().toString()); + this.catCtx.setSparkQuery(rv.getColumn().toString()); + } return rv; } /** * Prints parse tree string, IF catCtx has been setRuleNames(parser.getRuleNames()) + * * @param ctx current context * @return parse tree as string */ @@ -217,8 +225,9 @@ public Node visitDirectoryStatement(DPLParser.DirectoryStatementContext ctx) { ParseTree secondChild = ctx.getChild(1); // check if directoryStmt OR directoryStmt - if (secondChild instanceof TerminalNode && - ((TerminalNode) secondChild).getSymbol().getType() == DPLLexer.OR) { + if ( + secondChild instanceof TerminalNode && ((TerminalNode) secondChild).getSymbol().getType() == DPLLexer.OR + ) { LOGGER.debug("[DirStmt] OR detected"); orMode = true; } @@ -242,14 +251,14 @@ public Node visitDirectoryStatement(DPLParser.DirectoryStatementContext ctx) { Column c = null; if (n instanceof SubSearchNode) { - c = ((SubSearchNode)n).getColumn(); + c = ((SubSearchNode) n).getColumn(); } else if (n instanceof ColumnNode) { - c = ((ColumnNode)n).getColumn(); + c = ((ColumnNode) n).getColumn(); } else if (n instanceof CatalystNode) { LOGGER.debug("Got dataset from sub search"); - subSearchDs = ((CatalystNode)n).getDataset(); + subSearchDs = ((CatalystNode) n).getDataset(); continue; } else { @@ -301,7 +310,8 @@ public Node visitLogicalStatement(DPLParser.LogicalStatementContext ctx) { if (!(ctx.getChild(0) instanceof TerminalNode)) { left = visit(ctx.getChild(0)); - } else { + } + else { leftIsTerminal = (TerminalNode) ctx.getChild(0); if (leftIsTerminal.getSymbol().getType() != DPLLexer.NOT) { throw new RuntimeException("Unsupported unary logical operation: " + ctx.getText()); @@ -311,13 +321,15 @@ public Node 
visitLogicalStatement(DPLParser.LogicalStatementContext ctx) { if (ctx.getChildCount() == 1) { // leaf rv = left; - } else if (ctx.getChildCount() == 2) { + } + else if (ctx.getChildCount() == 2) { Node right = visit(ctx.getChild(1)); if (leftIsTerminal != null) { Column r = ((ColumnNode) right).getColumn(); // Use unary operation, currently only NOT is supported rv = new ColumnNode(functions.not(r)); - } else { + } + else { if (left instanceof ColumnNode && right instanceof ColumnNode) { Column l = ((ColumnNode) left).getColumn(); Column r = ((ColumnNode) right).getColumn(); @@ -326,7 +338,8 @@ public Node visitLogicalStatement(DPLParser.LogicalStatementContext ctx) { } } - } else if (ctx.getChildCount() == 3) { + } + else if (ctx.getChildCount() == 3) { TerminalNode operation = (TerminalNode) ctx.getChild(1); Node right = visit(ctx.getChild(2)); if (left instanceof ColumnNode && right instanceof ColumnNode) { @@ -335,7 +348,8 @@ public Node visitLogicalStatement(DPLParser.LogicalStatementContext ctx) { // resolve operation if (DPLLexer.AND == operation.getSymbol().getType()) { rv = new ColumnNode(l.and(r)); - } else if (DPLLexer.OR == operation.getSymbol().getType()) + } + else if (DPLLexer.OR == operation.getSymbol().getType()) rv = new ColumnNode(l.or(r)); else { throw new RuntimeException("Unsupported logical operation:" + operation); @@ -345,7 +359,7 @@ public Node visitLogicalStatement(DPLParser.LogicalStatementContext ctx) { if (rv instanceof SubSearchNode) { LOGGER.info("[CAT] [LogiStat] Return value was SubsearchNode. Converting to ColumnNode!"); - rv = new ColumnNode(((SubSearchNode)rv).getColumn()); + rv = new ColumnNode(((SubSearchNode) rv).getColumn()); } LOGGER.debug("visitLogicalStatement outgoing: <{}>", rv); @@ -403,11 +417,10 @@ public Node visitSubindexStatement(DPLParser.SubindexStatementContext ctx) { } /** - * searchQualifier : INDEX (EQ|NEQ) stringType WILDCARD? | SOURCETYPE (EQ|NEQ) - * stringType WILDCARD? | HOST (EQ|NEQ) stringType WILDCARD? | SOURCE (EQ|NEQ) - * stringType WILDCARD? | SAVEDSEARCH (EQ|NEQ) stringType WILDCARD? | EVENTTYPE - * (EQ|NEQ) stringType WILDCARD? | EVENTTYPETAG (EQ|NEQ) stringType WILDCARD? | - * HOSTTAG (EQ|NEQ) stringType WILDCARD? | TAG (EQ|NEQ) stringType WILDCARD? ; + * searchQualifier : INDEX (EQ|NEQ) stringType WILDCARD? | SOURCETYPE (EQ|NEQ) stringType WILDCARD? | HOST (EQ|NEQ) + * stringType WILDCARD? | SOURCE (EQ|NEQ) stringType WILDCARD? | SAVEDSEARCH (EQ|NEQ) stringType WILDCARD? | + * EVENTTYPE (EQ|NEQ) stringType WILDCARD? | EVENTTYPETAG (EQ|NEQ) stringType WILDCARD? | HOSTTAG (EQ|NEQ) + * stringType WILDCARD? | TAG (EQ|NEQ) stringType WILDCARD? ; */ @Override public Node visitSearchQualifier(DPLParser.SearchQualifierContext ctx) { @@ -452,7 +465,8 @@ else if (left.getSymbol().getType() == DPLLexer.SOURCETYPE) { String rlikeStatement = glob2rlike(value); sQualifier = col.rlike(rlikeStatement); - } else if (left.getSymbol().getType() == DPLLexer.INDEX_IN) { + } + else if (left.getSymbol().getType() == DPLLexer.INDEX_IN) { for (String index : listOfIndices) { String rlikeStatement = glob2rlike(index); if (sQualifier == null) { @@ -463,7 +477,8 @@ else if (left.getSymbol().getType() == DPLLexer.SOURCETYPE) { } } - } else { + } + else { String rlikeStatement = glob2rlike(value); sQualifier = functions.not(col.rlike(rlikeStatement)); } @@ -515,9 +530,9 @@ public Node visitIndexStatement(DPLParser.IndexStatementContext ctx) { /** * {@inheritDoc} - * - *

The default implementation returns the result of calling - * {@link #visitChildren} on {@code ctx}.

+ *

+ * The default implementation returns the result of calling {@link #visitChildren} on {@code ctx}. + *

*/ @Override public Node visitComparisonStatement(DPLParser.ComparisonStatementContext ctx) { @@ -548,22 +563,22 @@ public Node visitComparisonStatement(DPLParser.ComparisonStatementContext ctx) { } } - if (!specialCase) { col = new Column(field); - rv = this.addOperation(col, (TerminalNode) ctx.getChild(1), new UnquotedText(new TextString(ctx.getChild(2).getText())).read()); + rv = this + .addOperation(col, (TerminalNode) ctx.getChild(1), new UnquotedText(new TextString(ctx.getChild(2).getText())).read()); } return new ColumnNode(rv); } - private Column addOperation(Column source, TerminalNode operation, String value) { Column rv = null; SparkSession ss = catCtx.getSparkSession(); ss.udf().register("Comparison", new SearchComparison(), DataTypes.BooleanType); - rv = functions.callUDF("Comparison", source, functions.lit(operation.getSymbol().getType()), functions.lit(value)); + rv = functions + .callUDF("Comparison", source, functions.lit(operation.getSymbol().getType()), functions.lit(value)); return rv; } @@ -596,7 +611,6 @@ public Node visitSubsearchStatement(DPLParser.SubsearchStatementContext ctx) { return null; } - /*@Override public Node visitSubsearchStatement(DPLParser.SubsearchStatementContext ctx) { LOGGER.info("visitSubsearchStatement:" + ctx.getText()); @@ -638,8 +652,7 @@ public Node visitFieldType(DPLParser.FieldTypeContext ctx) { } /** - * subEvalLogicalStatement : PARENTHESIS_L subEvalLogicalStatement PARENTHESIS_R - * ; + * subEvalLogicalStatement : PARENTHESIS_L subEvalLogicalStatement PARENTHESIS_R ; */ @Override public Node visitL_evalStatement_subEvalStatement(DPLParser.L_evalStatement_subEvalStatementContext ctx) { @@ -656,7 +669,6 @@ public Node visitT_chart_by_column_rowOptions(DPLParser.T_chart_by_column_rowOpt return chartTransformation.visitT_chart_by_column_rowOptions(ctx); } - @Override public Node visitT_chart_fieldRenameInstruction(DPLParser.T_chart_fieldRenameInstructionContext ctx) { return chartTransformation.visitT_chart_fieldRenameInstruction(ctx); @@ -720,15 +732,21 @@ public Node visitEvalStringType(DPLParser.EvalStringTypeContext ctx) { return evalStatement.visitEvalStringType(ctx); } - public Node visitL_evalStatement_evalCalculateStatement_multipliers(DPLParser.L_evalStatement_evalCalculateStatement_multipliersContext ctx) { + public Node visitL_evalStatement_evalCalculateStatement_multipliers( + DPLParser.L_evalStatement_evalCalculateStatement_multipliersContext ctx + ) { return evalStatement.visitL_evalStatement_evalCalculateStatement_multipliers(ctx); } - public Node visitL_evalStatement_evalCalculateStatement_minus_plus(DPLParser.L_evalStatement_evalCalculateStatement_minus_plusContext ctx) { + public Node visitL_evalStatement_evalCalculateStatement_minus_plus( + DPLParser.L_evalStatement_evalCalculateStatement_minus_plusContext ctx + ) { return evalStatement.visitL_evalStatement_evalCalculateStatement_minus_plus(ctx); } - public Node visitL_evalStatement_evalConcatenateStatement(DPLParser.L_evalStatement_evalConcatenateStatementContext ctx) { + public Node visitL_evalStatement_evalConcatenateStatement( + DPLParser.L_evalStatement_evalConcatenateStatementContext ctx + ) { return evalStatement.visitL_evalStatement_evalConcatenateStatement(ctx); } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/LogicalStatementXML.java b/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/LogicalStatementXML.java index 90161f2..97e256c 100644 --- 
a/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/LogicalStatementXML.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/LogicalStatementXML.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.logicalstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -68,17 +67,21 @@ import java.util.stream.Collectors; /** - *

Contains the visitor functions for logicalStatement, which is used - * for the main search function of the language.

- *

These functions help to build the necessary archive query and Spark actions.

- * Example: - *
index=cinnamon earliest=-1y latest=-1d
- *

After the main logicalStatement, multiple - * {@link com.teragrep.pth10.ast.commands.transformstatement.TransformStatement transformStatements} - * that contain aggregations and other functions can be chained, or left unused if the user wants - * to perform a basic search.

+ *

+ * Contains the visitor functions for logicalStatement, which is used for the main search function of the language. + *

+ *

+ * These functions help to build the necessary archive query and Spark actions. + *

+ * Example:
index=cinnamon earliest=-1y latest=-1d
+ *

+ * After the main logicalStatement, multiple + * {@link com.teragrep.pth10.ast.commands.transformstatement.TransformStatement transformStatements} that contain + * aggregations and other functions can be chained, or left unused if the user wants to perform a basic search. + *

*/ public class LogicalStatementXML extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(LogicalStatementXML.class); private final DPLParserCatalystContext catCtx; private final boolean isMetadataQuery; @@ -97,8 +100,9 @@ public LogicalStatementXML(DPLParserCatalystContext catCtx, Document doc, boolea } /** - * Visits the parse tree for SearchTransformationRoot and returns a LogicalXMLStep that can be added to - * Steptree in DPLParserCatalystVisitor. + * Visits the parse tree for SearchTransformationRoot and returns a LogicalXMLStep that can be added to Steptree in + * DPLParserCatalystVisitor. + * * @param ctx SearchTransformationRootContext * @return LogicalXMLStep */ @@ -113,18 +117,22 @@ public AbstractStep visitLogicalStatementXML(DPLParser.SearchTransformationRootC } /** - * The main visitor function for searchTransformation, used for the main search function. - *
+     * The main visitor function for searchTransformation, used for the main search function. 
      *     root : searchTransformationRoot transformStatement?
      *     searchTransformationRoot : logicalStatement
      * 
+ * * @param ctx SearchTransformationRoot context * @return logicalStatement columnNode */ @Override public Node visitSearchTransformationRoot(DPLParser.SearchTransformationRootContext ctx) { ElementNode archiveQuery; - LOGGER.info("[SearchTransformationRoot XML] Visiting: <{}> with <{}> children", ctx.getText(), ctx.getChildCount()); + LOGGER + .info( + "[SearchTransformationRoot XML] Visiting: <{}> with <{}> children", ctx.getText(), + ctx.getChildCount() + ); if (ctx.getChildCount() == 1) { // just a single directoryStatement -or- logicalStatement @@ -132,8 +140,9 @@ public Node visitSearchTransformationRoot(DPLParser.SearchTransformationRootCont } else { ParseTree secondChild = ctx.getChild(1); - if (secondChild instanceof TerminalNode && - ((TerminalNode) secondChild).getSymbol().getType() == DPLLexer.OR) { + if ( + secondChild instanceof TerminalNode && ((TerminalNode) secondChild).getSymbol().getType() == DPLLexer.OR + ) { // case: directoryStmt OR logicalStmt ElementNode dirStatArchiveQuery = (ElementNode) visit(ctx.directoryStatement()); ElementNode logiStatArchiveQuery = (ElementNode) visit(ctx.logicalStatement(0)); @@ -157,7 +166,8 @@ public Node visitSearchTransformationRoot(DPLParser.SearchTransformationRootCont if (firstLogicalStmt) { andElem.appendChild(logiStatArchiveQuery.getElement()); firstLogicalStmt = false; - } else { + } + else { Element newAndElem = doc.createElement("AND"); newAndElem.appendChild(andElem); newAndElem.appendChild(logiStatArchiveQuery.getElement()); @@ -169,8 +179,8 @@ public Node visitSearchTransformationRoot(DPLParser.SearchTransformationRootCont } } - - if (archiveQuery != null) this.catCtx.setArchiveQuery(archiveQuery.toString()); + if (archiveQuery != null) + this.catCtx.setArchiveQuery(archiveQuery.toString()); LOGGER.info("XML archive query: <{}>", archiveQuery); return archiveQuery; @@ -197,8 +207,9 @@ public Node visitDirectoryStatement(DPLParser.DirectoryStatementContext ctx) { ParseTree secondChild = ctx.getChild(1); // check if directoryStmt OR directoryStmt - if (secondChild instanceof TerminalNode && - ((TerminalNode) secondChild).getSymbol().getType() == DPLLexer.OR) { + if ( + secondChild instanceof TerminalNode && ((TerminalNode) secondChild).getSymbol().getType() == DPLLexer.OR + ) { LOGGER.debug("[DirStmt] OR detected"); orMode = true; } @@ -225,7 +236,8 @@ public Node visitDirectoryStatement(DPLParser.DirectoryStatementContext ctx) { if (n instanceof SubSearchNode) { SubSearchNode ssn = (SubSearchNode) n; el.appendChild(ssn.asElement(doc)); - } else { + } + else { Element e = ((ElementNode) n).getElement(); el.appendChild(e); } @@ -271,7 +283,8 @@ public Node visitLogicalStatement(DPLParser.LogicalStatementContext ctx) { // Visit leftmost child if it is not a terminal node if (!(ctx.getChild(0) instanceof TerminalNode)) { left = visit(ctx.getChild(0)); - } else { + } + else { // TerminalNode can only be "NOT" leftIsTerminal = (TerminalNode) ctx.getChild(0); if (leftIsTerminal.getSymbol().getType() != DPLLexer.NOT) { @@ -283,7 +296,8 @@ public Node visitLogicalStatement(DPLParser.LogicalStatementContext ctx) { if (ctx.getChildCount() == 1) { // leaf rv = left; - } else if (ctx.getChildCount() == 2) { + } + else if (ctx.getChildCount() == 2) { // Two children, visit rightmost child Node right = visit(ctx.getChild(1)); Element el; @@ -291,7 +305,8 @@ public Node visitLogicalStatement(DPLParser.LogicalStatementContext ctx) { if (leftIsTerminal != null) { // Should be NOT el = doc.createElement(leftIsTerminal.getText().toUpperCase()); 
- } else { + } + else { // Add missing AND between elements el = doc.createElement("AND"); } @@ -305,7 +320,8 @@ public Node visitLogicalStatement(DPLParser.LogicalStatementContext ctx) { rv = new ElementNode(el); - } else if (ctx.getChildCount() == 3) { + } + else if (ctx.getChildCount() == 3) { // Three children; logicalStmt AND/OR logicalStmt TerminalNode operation = (TerminalNode) ctx.getChild(1); Node right = visit(ctx.getChild(2)); @@ -320,7 +336,7 @@ public Node visitLogicalStatement(DPLParser.LogicalStatementContext ctx) { if (rv instanceof SubSearchNode) { LOGGER.info("[XML] [LogiStat] Return value was SubsearchNode. Converting to ElementNode!"); - return new ElementNode(((SubSearchNode)rv).asElement(doc)); + return new ElementNode(((SubSearchNode) rv).asElement(doc)); } LOGGER.debug("visitLogicalStatement outgoing: <{}>", rv); @@ -358,11 +374,10 @@ public Node visitSublogicalStatement(DPLParser.SublogicalStatementContext ctx) { } /** - * searchQualifier : INDEX (EQ|NEQ) stringType WILDCARD? | SOURCETYPE (EQ|NEQ) - * stringType WILDCARD? | HOST (EQ|NEQ) stringType WILDCARD? | SOURCE (EQ|NEQ) - * stringType WILDCARD? | SAVEDSEARCH (EQ|NEQ) stringType WILDCARD? | EVENTTYPE - * (EQ|NEQ) stringType WILDCARD? | EVENTTYPETAG (EQ|NEQ) stringType WILDCARD? | - * HOSTTAG (EQ|NEQ) stringType WILDCARD? | TAG (EQ|NEQ) stringType WILDCARD? ; + * searchQualifier : INDEX (EQ|NEQ) stringType WILDCARD? | SOURCETYPE (EQ|NEQ) stringType WILDCARD? | HOST (EQ|NEQ) + * stringType WILDCARD? | SOURCE (EQ|NEQ) stringType WILDCARD? | SAVEDSEARCH (EQ|NEQ) stringType WILDCARD? | + * EVENTTYPE (EQ|NEQ) stringType WILDCARD? | EVENTTYPETAG (EQ|NEQ) stringType WILDCARD? | HOSTTAG (EQ|NEQ) + * stringType WILDCARD? | TAG (EQ|NEQ) stringType WILDCARD? ; */ @Override public Node visitSearchQualifier(DPLParser.SearchQualifierContext ctx) { @@ -379,7 +394,8 @@ public Node visitSearchQualifier(DPLParser.SearchQualifierContext ctx) { // check whether operation is '=' or '!=' if (operation.getSymbol().getType() == DPLLexer.EQ) { comparisonToken = new Token(Type.EQUALS); - } else { + } + else { comparisonToken = new Token(Type.NOT_EQUALS); } @@ -408,7 +424,8 @@ else if (left.getSymbol().getType() == DPLLexer.SOURCETYPE) { // other column=value qualifier value = new UnquotedText(new TextString(ctx.getChild(2).getText().toLowerCase())).read(); el = doc.createElement(ctx.getChild(0).getText().toLowerCase()); - LOGGER.debug("custom qualifier: field=<{}> = value=<{}>", ctx.getChild(0).getText(), ctx.getChild(2).getText()); + LOGGER + .debug("custom qualifier: field=<{}> = value=<{}>", ctx.getChild(0).getText(), ctx.getChild(2).getText()); } if (listOfIndices.isEmpty()) { @@ -438,7 +455,6 @@ else if (i < 2) { el = outerOR; } - } } @@ -488,9 +504,9 @@ public Node visitIndexStatement(DPLParser.IndexStatementContext ctx) { /** * {@inheritDoc} - * - *

The default implementation returns the result of calling - * {@link #visitChildren} on {@code ctx}.

+ *

+ * The default implementation returns the result of calling {@link #visitChildren} on {@code ctx}. + *

*/ @Override public Node visitComparisonStatement(DPLParser.ComparisonStatementContext ctx) { diff --git a/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/TimeStatement.java b/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/TimeStatement.java index 6f17999..4a17eb0 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/TimeStatement.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/TimeStatement.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.logicalstatement; import com.teragrep.pth10.ast.*; @@ -65,13 +64,14 @@ import java.sql.Timestamp; import java.text.ParseException; -import java.util.Stack; /** - *

A subrule of logicalStatement, used for statements of time such as - * earliest, latest, and et cetera.

+ *

+ * A subrule of logicalStatement, used for statements of time such as earliest, latest, et cetera. + *
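A minimal sketch of what such time statements resolve to as unix epochs (illustrative only; the real implementation relies on the project's RelativeTimestamp and DPLTimeFormat classes):

    import java.time.Instant;
    import java.time.temporal.ChronoUnit;

    public final class TimeQualifierSketch {

        public static void main(final String[] args) {
            // Roughly what earliest=-1d and latest=now resolve to as unix epochs.
            final Instant now = Instant.now();
            final long earliest = now.minus(1, ChronoUnit.DAYS).getEpochSecond();
            final long latest = now.getEpochSecond();
            System.out.println("earliest >= " + earliest + ", latest <= " + latest);
        }
    }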

*/ public class TimeStatement extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(TimeStatement.class); private final Document doc; @@ -121,13 +121,10 @@ public Node visitTimeStatement(DPLParser.TimeStatementContext ctx) { /** * {@inheritDoc} - * *

- * The default implementation returns the result of calling - * {@link #visitChildren} on {@code ctx}. + * The default implementation returns the result of calling {@link #visitChildren} on {@code ctx}. *

- * timeFormatQualifier : TIMEFORMAT EQ stringType //FIXME implement Time - * Properties ; + * timeFormatQualifier : TIMEFORMAT EQ stringType //FIXME implement Time Properties ; */ @Override public Node visitTimeFormatQualifier(DPLParser.TimeFormatQualifierContext ctx) { @@ -137,12 +134,9 @@ public Node visitTimeFormatQualifier(DPLParser.TimeFormatQualifierContext ctx) { /** * {@inheritDoc} - * *

- * The default implementation returns the result of calling - * {@link #visitChildren} on {@code ctx}. + * The default implementation returns the result of calling {@link #visitChildren} on {@code ctx}. *

- * *
      * timeQualifier
      * : EARLIEST EQ stringType
@@ -176,11 +170,11 @@ public Node visitTimeFormatQualifier(DPLParser.TimeFormatQualifierContext ctx) {
     public Node visitTimeQualifier(DPLParser.TimeQualifierContext ctx) {
         Node rv = null;
         switch (mode) {
-            case XML:{
+            case XML: {
                 rv = timeQualifierEmitXml(ctx);
                 break;
             }
-            case CATALYST:{
+            case CATALYST: {
                 rv = timeQualifierEmitCatalyst(ctx);
                 break;
             }
@@ -189,9 +183,10 @@ public Node visitTimeQualifier(DPLParser.TimeQualifierContext ctx) {
     }
 
     /**
-     * Returns an ElementNode with {@literal LE(<=) or GE(>=)} of unix time used
-     * to restrict search results to certain timeframe. 
+ * Returns an ElementNode with {@literal LE(<=) or GE(>=)} of unix time used to restrict search results to a certain + * timeframe.
* Supports EARLIEST, INDEX_EARLIEST, LATEST, INDEX_LATEST + * * @param ctx * @return ElementNode(XML) with LE/GE unixtime */ @@ -214,7 +209,8 @@ private ElementNode timeQualifierEmitXml(DPLParser.TimeQualifierContext ctx) { // relative time RelativeTimestamp rtTimestamp = rtParser.parse(value); // might throw NFE if not relative timestamp timevalue = rtTimestamp.calculate(now); - } catch (NumberFormatException ne) { + } + catch (NumberFormatException ne) { // absolute time timevalue = this.getEpochFromString(value, catCtx.getTimeFormatString()); } @@ -257,9 +253,10 @@ private ElementNode timeQualifierEmitXml(DPLParser.TimeQualifierContext ctx) { } /** - * Returns a ColumnNode containing a column with {@literal LEQ(<=) or GEQ(>=)} of unix time used - * to restrict search results to certain timeframe.
+ * Returns a ColumnNode containing a column with {@literal LEQ(<=) or GEQ(>=)} of unix time used to restrict search + * results to a certain timeframe.
* Supports EARLIEST, INDEX_EARLIEST, LATEST, INDEX_LATEST + * * @param ctx * @return ColumnNode with leq/geq unixtime */ @@ -283,7 +280,8 @@ private ColumnNode timeQualifierEmitCatalyst(DPLParser.TimeQualifierContext ctx) // relative time RelativeTimestamp rtTimestamp = rtParser.parse(value); timevalue = rtTimestamp.calculate(now); - } catch (NumberFormatException ne) { + } + catch (NumberFormatException ne) { // absolute time timevalue = this.getEpochFromString(value, catCtx.getTimeFormatString()); } @@ -321,14 +319,16 @@ private long getEpochFromString(String value, String timeFormatString) { long timevalue = 0; if (timeFormatString == null || timeFormatString.equals("")) { timevalue = new DefaultTimeFormat().getEpoch(value); - } else { + } + else { // TODO: should be included in DPLTimeFormat if (timeFormatString.equals("%s")) { return Long.parseLong(value); } try { timevalue = new DPLTimeFormat(timeFormatString).getEpoch(value); - } catch (ParseException e) { + } + catch (ParseException e) { throw new RuntimeException("TimeQualifier conversion error: <" + value + "> can't be parsed."); } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/UDFs/SearchComparison.java b/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/UDFs/SearchComparison.java index 8dd1efa..1d590dc 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/UDFs/SearchComparison.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/logicalstatement/UDFs/SearchComparison.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.logicalstatement.UDFs; import com.teragrep.jue_01.GlobToRegEx; @@ -60,6 +59,7 @@ * UDF for comparing a field to a value in a search command. 
*/ public class SearchComparison implements UDF3 { + @Override public Boolean call(Object l, Integer operationType, Object r) throws Exception { // Parse in case a number has been set to a String @@ -81,19 +81,23 @@ public Boolean call(Object l, Integer operationType, Object r) throws Exception // get left as string if (leftType == ParsedResult.Type.STRING) { leftString = left.getString(); - } else if (leftType == ParsedResult.Type.DOUBLE) { + } + else if (leftType == ParsedResult.Type.DOUBLE) { leftString = BigDecimal.valueOf(left.getDouble()).toPlainString(); - } else { + } + else { leftString = BigDecimal.valueOf(left.getLong()).toPlainString(); } // get right as string if (rightType == ParsedResult.Type.STRING) { rightString = right.getString(); - } else if (rightType == ParsedResult.Type.DOUBLE) { + } + else if (rightType == ParsedResult.Type.DOUBLE) { // make into a string through BigDecimal to get it exactly as written in the command rightString = BigDecimal.valueOf(right.getDouble()).stripTrailingZeros().toPlainString(); - } else { + } + else { rightString = BigDecimal.valueOf(right.getLong()).stripTrailingZeros().toPlainString(); } @@ -142,8 +146,10 @@ public Boolean call(Object l, Integer operationType, Object r) throws Exception } } else { - BigDecimal leftNumber = left.getType() == ParsedResult.Type.DOUBLE ? BigDecimal.valueOf(left.getDouble()) : BigDecimal.valueOf(left.getLong()); - BigDecimal rightNumber = right.getType() == ParsedResult.Type.DOUBLE ? BigDecimal.valueOf(right.getDouble()) : BigDecimal.valueOf(right.getLong()); + BigDecimal leftNumber = left.getType() == ParsedResult.Type.DOUBLE ? BigDecimal + .valueOf(left.getDouble()) : BigDecimal.valueOf(left.getLong()); + BigDecimal rightNumber = right.getType() == ParsedResult.Type.DOUBLE ? BigDecimal + .valueOf(right.getDouble()) : BigDecimal.valueOf(right.getLong()); switch (operationType) { case DPLLexer.EQ: diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/AddtotalsTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/AddtotalsTransformation.java index 7cbbbe9..d07b4d3 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/AddtotalsTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/AddtotalsTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
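The BigDecimal normalisation used above can be seen in isolation with a minimal sketch (illustrative only):

    import java.math.BigDecimal;

    public final class NumericComparisonSketch {

        public static void main(final String[] args) {
            // A plain string comparison treats "1.50" and "1.5" as different values...
            System.out.println("1.50".equals("1.5")); // false
            // ...while BigDecimal agrees they are numerically equal,
            // and stripTrailingZeros() yields the canonical textual form.
            System.out.println(new BigDecimal("1.50").compareTo(new BigDecimal("1.5"))); // 0
            System.out.println(new BigDecimal("1.50").stripTrailingZeros().toPlainString()); // 1.5
        }
    }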
* * * Additional permission under GNU Affero General Public License version 3 @@ -59,6 +59,7 @@ import java.util.List; public class AddtotalsTransformation extends DPLParserBaseVisitor { + private final DPLParserCatalystContext catCtx; private AddtotalsStep addtotalsStep; private boolean row; @@ -97,9 +98,11 @@ public Node visitFieldListType(DPLParser.FieldListTypeContext ctx) { public Node visitT_addtotals_rowParameter(DPLParser.T_addtotals_rowParameterContext ctx) { if (ctx.booleanType().GET_BOOLEAN_FALSE() != null) { row = false; - } else if (ctx.booleanType().GET_BOOLEAN_TRUE() != null) { + } + else if (ctx.booleanType().GET_BOOLEAN_TRUE() != null) { row = true; - } else { + } + else { throw new IllegalArgumentException("Unexpected 'row' booleanType: " + ctx.getText()); } return new NullNode(); @@ -109,9 +112,11 @@ public Node visitT_addtotals_rowParameter(DPLParser.T_addtotals_rowParameterCont public Node visitT_addtotals_colParameter(DPLParser.T_addtotals_colParameterContext ctx) { if (ctx.booleanType().GET_BOOLEAN_FALSE() != null) { col = false; - } else if (ctx.booleanType().GET_BOOLEAN_TRUE() != null) { + } + else if (ctx.booleanType().GET_BOOLEAN_TRUE() != null) { col = true; - } else { + } + else { throw new IllegalArgumentException("Unexpected 'col' booleanType: " + ctx.getText()); } return new NullNode(); @@ -125,7 +130,7 @@ public Node visitT_addtotals_fieldnameParameter(DPLParser.T_addtotals_fieldnameP @Override public Node visitT_addtotals_labelfieldParameter(DPLParser.T_addtotals_labelfieldParameterContext ctx) { - labelField = new UnquotedText(new TextString(ctx.fieldType().getText())).read(); + labelField = new UnquotedText(new TextString(ctx.fieldType().getText())).read(); return new NullNode(); } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/ChartTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/ChartTransformation.java index d0d1d62..df52efe 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/ChartTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/ChartTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
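// Hedged sketch of the booleanType handling used by the addtotals row/col parameters above:
// only an explicit true/false value is accepted, anything else is rejected. The real visitor
// checks lexer token types (GET_BOOLEAN_TRUE / GET_BOOLEAN_FALSE) rather than strings.
class BooleanParameterSketch {

    static boolean parseBooleanToken(String token) {
        if ("true".equalsIgnoreCase(token)) {
            return true;
        }
        else if ("false".equalsIgnoreCase(token)) {
            return false;
        }
        else {
            throw new IllegalArgumentException("Unexpected booleanType: " + token);
        }
    }

    public static void main(String[] args) {
        System.out.println(parseBooleanToken("TRUE"));  // true
        System.out.println(parseBooleanToken("false")); // false
    }
}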
*/ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.functions.dpf_02.SortByClause; @@ -69,6 +68,7 @@ * Base visitor class for the chart command */ public class ChartTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(ChartTransformation.class); DPLParserCatalystContext catCtx; @@ -79,6 +79,7 @@ public class ChartTransformation extends DPLParserBaseVisitor { /** * Initialize the class to use in TransformStatement + * * @param catCtx catalyst context */ public ChartTransformation(DPLParserCatalystContext catCtx) { @@ -87,13 +88,13 @@ public ChartTransformation(DPLParserCatalystContext catCtx) { this.aggregateFunction = new AggregateFunction(catCtx); } - public String getAggregateField() {return this.aggregateField;} + public String getAggregateField() { + return this.aggregateField; + } /** - * chartTransformation : CHART - * (sepChartParameter|formatParameter|contParameter|limitParameter|aggParameter)* - * (aggregationInstruction|sparklineAggregationInstruction|PARENTHESIS_L * - * evalStatement PARENTHESIS_R)+ + * chartTransformation : CHART (sepChartParameter|formatParameter|contParameter|limitParameter|aggParameter)* + * (aggregationInstruction|sparklineAggregationInstruction|PARENTHESIS_L * evalStatement PARENTHESIS_R)+ * ((overInstruction(divideByInstruction)?)|(divideByInstruction))? ; */ @@ -106,44 +107,54 @@ public Node visitChartTransformation(DPLParser.ChartTransformationContext ctx) { /** * Goes through all the chart command's parameters and performs the aggregation via the stack + * * @param ctx * @return */ private Node visitChartTransformationEmitCatalyst(DPLParser.ChartTransformationContext ctx) { - LOGGER.info("ChartTransformation incoming: text=<{}>", ctx.getText()); - - ArrayList listOfExpr = new ArrayList<>(); - // aggregate function and its field renaming instruction - for (DPLParser.T_chart_aggregationInstructionContext c : ctx.t_chart_aggregationInstruction()) { - // Visit aggregation function - Node aggFunction = visit(c.getChild(0)); - Column aggCol = ((ColumnNode) aggFunction).getColumn(); - - // Get field rename instruction if exists, and apply the rename to aggCol - if (c.t_chart_fieldRenameInstruction() != null) { - String fieldName = visit(c.t_chart_fieldRenameInstruction()).toString(); - aggCol = aggCol.as(fieldName); - } - - // add to list of expressions - listOfExpr.add(aggCol); - } + LOGGER.info("ChartTransformation incoming: text=<{}>", ctx.getText()); + + ArrayList listOfExpr = new ArrayList<>(); + // aggregate function and its field renaming instruction + for (DPLParser.T_chart_aggregationInstructionContext c : ctx.t_chart_aggregationInstruction()) { + // Visit aggregation function + Node aggFunction = visit(c.getChild(0)); + Column aggCol = ((ColumnNode) aggFunction).getColumn(); + + // Get field rename instruction if exists, and apply the rename to aggCol + if (c.t_chart_fieldRenameInstruction() != null) { + String fieldName = visit(c.t_chart_fieldRenameInstruction()).toString(); + aggCol = aggCol.as(fieldName); + } + + // add to list of expressions + listOfExpr.add(aggCol); + } final List listOfGroupBys = new ArrayList<>(); ArrayList listOfSbc = new ArrayList<>(); - // groupBy given column - if (ctx.t_chart_by_column_rowOptions() != null && !ctx.t_chart_by_column_rowOptions().isEmpty()) { + // groupBy given column + if (ctx.t_chart_by_column_rowOptions() != null && !ctx.t_chart_by_column_rowOptions().isEmpty()) { ctx.t_chart_by_column_rowOptions().forEach(opt -> { 
if (opt.t_column_Parameter() != null && opt.t_column_Parameter().fieldType() != null) { listOfGroupBys.add(functions.col(opt.t_column_Parameter().fieldType().getText())); } if (opt.t_row_Parameter() != null && !opt.t_row_Parameter().fieldType().isEmpty()) { - listOfGroupBys.addAll(opt.t_row_Parameter().fieldType().stream().map(field -> functions.col(field.getText())).collect(Collectors.toList())); - listOfSbc.addAll(opt.t_row_Parameter().fieldType().stream().map(this::createSbc).collect(Collectors.toList())); + listOfGroupBys + .addAll( + opt + .t_row_Parameter() + .fieldType() + .stream() + .map(field -> functions.col(field.getText())) + .collect(Collectors.toList()) + ); + listOfSbc + .addAll(opt.t_row_Parameter().fieldType().stream().map(this::createSbc).collect(Collectors.toList())); } }); - } + } chartStep = new ChartStep(listOfExpr, listOfGroupBys); SortStep sortStep = new SortStep(catCtx, listOfSbc, this.catCtx.getDplRecallSize(), false); @@ -152,24 +163,26 @@ private Node visitChartTransformationEmitCatalyst(DPLParser.ChartTransformationC steps.add(chartStep); steps.add(sortStep); - return new StepListNode(steps); + return new StepListNode(steps); } @Override public Node visitAggregateFunction(DPLParser.AggregateFunctionContext ctx) { Node rv = aggregateFunction.visitAggregateFunction(ctx); - if(aggregateField == null) + if (aggregateField == null) aggregateField = aggregateFunction.getAggregateField(); return rv; } - @Override public Node visitT_row_Parameter(DPLParser.T_row_ParameterContext ctx) { + @Override + public Node visitT_row_Parameter(DPLParser.T_row_ParameterContext ctx) { String target = ctx.getText(); return new StringNode(new Token(Type.STRING, target)); } - @Override public Node visitT_column_Parameter(DPLParser.T_column_ParameterContext ctx) { + @Override + public Node visitT_column_Parameter(DPLParser.T_column_ParameterContext ctx) { String target = ctx.getText(); return new StringNode(new Token(Type.STRING, target)); } @@ -180,17 +193,17 @@ public Node visitT_chart_by_column_rowOptions(List divInsts = new ArrayList<>(); ctxList.forEach(c -> { // grammar: t_row_Parameter? t_column_Parameter? - String f= null; + String f = null; Node rn = null; // Check row-parameter - if(c.t_row_Parameter() != null) { + if (c.t_row_Parameter() != null) { rn = visit(c.t_row_Parameter()); // Node n = visitT_chart_divideByInstruction(c); f = rn.toString(); divInsts.add(f); } // Check also optional column-parameter - if(c.t_column_Parameter() != null) { + if (c.t_column_Parameter() != null) { rn = visit(c.t_column_Parameter()); if (rn != null) { f = rn.toString(); @@ -199,7 +212,6 @@ public Node visitT_chart_by_column_rowOptions(List. + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.TextString; @@ -62,22 +61,23 @@ /** * The 'convert' command is used to convert field values in search results into numerical values.
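// Hedged sketch of the chart aggregation flow above: build the aggregate columns, apply an
// optional AS rename, then group by the BY/row columns. The column and field names here
// (latency, host) are illustrative only, not taken from the patch.
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import static org.apache.spark.sql.functions.avg;
import static org.apache.spark.sql.functions.col;

class ChartAggregationSketch {

    static Dataset<Row> chart(Dataset<Row> events) {
        // | chart avg(latency) as avg_latency by host
        Column aggCol = avg(col("latency")).as("avg_latency");
        return events.groupBy(col("host")).agg(aggCol);
    }
}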
* Original values are replaced by the new values, unless the 'AS' clause is used.
- * * The command has various functions, that work in different ways:
* auto(x) - convert x automatically using the best conversion
* ctime(x) - convert x (epoch time) to a human readable time, timeformat option can be used to change the format
* dur2sec(x) - convert x (HH:MM:SS) to epoch seconds
* memk(x) - convert x (positive number with g/m/k) to kilobytes
- * mktime(x) - convert x (human readable time) to epoch. Use timeformat to specify exact format from which to convert
+ * mktime(x) - convert x (human readable time) to epoch. Use timeformat to specify exact format from which to + * convert
* mstime(x) - convert x (MM:SS.SSS) to seconds
* none(x) - ignore x fields in conversion functions
* num(x) - works like auto(x), but values that cannot be converted are ignored/removed
* rmcomma(x) - remove commas from x
* rmunit(x) - removes trailing text from x
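// Hedged sketch of two of the convert functions listed above, dur2sec() and memk(), as plain
// Java; the real command applies the conversions per row inside Spark, so this only shows
// the arithmetic, not the integration.
class ConvertFunctionSketch {

    // dur2sec("01:30:15") -> 5415 seconds
    static long dur2sec(String duration) {
        String[] parts = duration.split(":");
        long seconds = 0;
        for (String part : parts) {
            seconds = seconds * 60 + Long.parseLong(part);
        }
        return seconds;
    }

    // memk("2g") -> 2097152.0 kilobytes; a bare number is already kilobytes
    static double memk(String value) {
        char unit = Character.toLowerCase(value.charAt(value.length() - 1));
        String number = Character.isDigit(unit) ? value : value.substring(0, value.length() - 1);
        double kb = Double.parseDouble(number);
        if (unit == 'm') {
            kb = kb * 1024d;
        }
        else if (unit == 'g') {
            kb = kb * 1024d * 1024d;
        }
        return kb;
    }

    public static void main(String[] args) {
        System.out.println(dur2sec("01:30:15")); // 5415
        System.out.println(memk("2g"));          // 2097152.0
    }
}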
+ * * @author eemhu - * */ public class ConvertTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(ConvertTransformation.class); public ConvertStep convertStep = null; @@ -94,6 +94,7 @@ public Node visitConvertTransformation(DPLParser.ConvertTransformationContext ct /** * Main visiting function for convert command + * * @param ctx convert command main context * @return step node */ @@ -195,13 +196,11 @@ public Node visitT_convert_fieldRenameInstruction(DPLParser.T_convert_fieldRenam return null; } - - - /* @Override + /* @Override public Node visitStringType(DPLParser.StringTypeContext ctx) { return new StringNode(new Token(Token.Type.STRING, new UnquotedText(new TextString(ctx.getText())).read())); } -*/ + */ private void buildStep(ConvertCommand.ConvertCommandType type, String wcfield) { this.cmd = new ConvertCommand(); this.cmd.setCommandType(type); diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/DPLTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/DPLTransformation.java index e6f519e..8d4fe83 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/DPLTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/DPLTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -62,10 +61,10 @@ import java.util.*; /** - * Base transformation for DPL command - * Currently allows only one subcommand, "parsetree" + * Base transformation for DPL command Currently allows only one subcommand, "parsetree" */ public class DPLTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(DPLTransformation.class); DPLParserCatalystContext catCtx = null; @@ -87,7 +86,6 @@ public Node dplTransformationEmitCatalyst(DPLParser.DplTransformationContext ctx String explainStr; List lines = new ArrayList<>(); - explainStr = "dpl"; if (ctx.t_dpl_basefilenameParameter() != null) { String command = ctx.t_dpl_basefilenameParameter().getChild(1).getText(); @@ -110,9 +108,8 @@ public Node dplTransformationEmitCatalyst(DPLParser.DplTransformationContext ctx } } - // FIXME: This has never been functional, seemingly it was supposed to print subsearch parse tree. 
- /* if(ctx.t_dpl_subsearchParameter()!= null ){ + /* if(ctx.t_dpl_subsearchParameter()!= null ){ //add subsearch result if (ctx.t_dpl_subsearchParameter().getChild(1).getText().equalsIgnoreCase("true")) { if (symbolTable.containsKey("SubsearchParseTree")) { diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/DedupTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/DedupTransformation.java index 617a0cd..1114d3a 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/DedupTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/DedupTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.functions.dpf_02.SortByClause; @@ -65,6 +64,7 @@ import java.util.List; public class DedupTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(DedupTransformation.class); private final DPLParserCatalystContext catCtx; @@ -90,8 +90,10 @@ public Node visitDedupTransformation(DPLParser.DedupTransformationContext ctx) { maxDuplicates = Integer.parseInt(ctx.t_dedup_numberParameter().getText()); } catch (NumberFormatException nfe) { - throw new IllegalArgumentException("Invalid limit parameter value. It must be larger or equal to 1," + - "however within the limits of an IntegerType."); + throw new IllegalArgumentException( + "Invalid limit parameter value. It must be larger or equal to 1," + + "however within the limits of an IntegerType." 
+ ); } if (maxDuplicates < 1) { @@ -121,7 +123,6 @@ public Node visitDedupTransformation(DPLParser.DedupTransformationContext ctx) { for (int i = 0; i < sortByInstCtx.getChildCount(); i++) { ParseTree child = sortByInstCtx.getChild(i); - if (child instanceof DPLParser.T_dedup_sortOrderContext) { if (sbc != null) { // add previous (if any) sortByClause to list @@ -146,7 +147,7 @@ else if (child instanceof DPLParser.T_dedup_sortbyMethodNumContext) { sbc.setFieldName(((DPLParser.T_dedup_sortbyMethodNumContext) child).fieldType().getText()); } else if (child instanceof DPLParser.T_dedup_sortbyMethodStrContext) { - assert sbc != null: "Sort by method STR expected a sort by clause, instead was null"; + assert sbc != null : "Sort by method STR expected a sort by clause, instead was null"; sbc.setSortAsType(SortByClause.Type.STRING); sbc.setFieldName(((DPLParser.T_dedup_sortbyMethodStrContext) child).fieldType().getText()); } @@ -158,21 +159,37 @@ else if (child instanceof DPLParser.T_dedup_sortbyMethodStrContext) { sortStep = new SortStep(catCtx, listOfSortByClauses, this.catCtx.getDplRecallSize(), false); // no support for desc in dedup - LOGGER.info("Processing sortByClauses in dedup with params: sbc={}, limit={}, desc={}", - Arrays.toString(sortStep.getListOfSortByClauses().toArray()), sortStep.getLimit(), sortStep.isDesc()); + LOGGER + .info( + "Processing sortByClauses in dedup with params: sbc={}, limit={}, desc={}", Arrays + .toString(sortStep.getListOfSortByClauses().toArray()), + sortStep.getLimit(), sortStep.isDesc() + ); } // initialize dedupStep here, so the sorted ds will be used if it was set - this.dedupStep = new DedupStep(listOfFields, maxDuplicates, keepEmpty, keepEvents, consecutive, catCtx, sortStep!=null); - - LOGGER.info("Processing dedup with params: limit={}, keepempty={}, keepevents={}, consecutive={}, cols={}", - maxDuplicates, keepEmpty, keepEvents, consecutive, Arrays.toString(listOfFields.toArray())); + this.dedupStep = new DedupStep( + listOfFields, + maxDuplicates, + keepEmpty, + keepEvents, + consecutive, + catCtx, + sortStep != null + ); + + LOGGER + .info( + "Processing dedup with params: limit={}, keepempty={}, keepevents={}, consecutive={}, cols={}", + maxDuplicates, keepEmpty, keepEvents, consecutive, Arrays.toString(listOfFields.toArray()) + ); // only return StepListNode if sort is used as they're two separate step objects (dedup & sort) if (sortStep != null) { return new StepListNode(Arrays.asList(sortStep, this.dedupStep)); - } else { + } + else { return new StepNode(this.dedupStep); } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/EvalTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/EvalTransformation.java index fa35728..05ff37b 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/EvalTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/EvalTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. 
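// Hedged sketch of the dedup flow above: optionally sort first, then keep a limited number
// of rows per deduplication key. pth_10 does this with dedicated SortStep/DedupStep classes
// and a streaming-safe implementation; this batch illustration keeps one row per host and
// does not guarantee which one is kept.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import static org.apache.spark.sql.functions.col;

class DedupSketch {

    static Dataset<Row> dedupPerHost(Dataset<Row> events) {
        // roughly: | dedup host sortby + num(_time)
        return events
                .orderBy(col("_time").asc())
                .dropDuplicates("host");
    }
}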
* * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -60,12 +59,12 @@ import java.util.ArrayList; import java.util.List; - /** - * Base transformation for evaluation commands, actual implementations of - * the commands can be found in {@link com.teragrep.pth10.ast.commands.evalstatement.EvalStatement EvalStatement} + * Base transformation for evaluation commands, actual implementations of the commands can be found in + * {@link com.teragrep.pth10.ast.commands.evalstatement.EvalStatement EvalStatement} */ public class EvalTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(EvalTransformation.class); public EvalStatement evalStatement; @@ -95,7 +94,7 @@ private Node evalTransformationEmitCatalyst(DPLParser.EvalTransformationContext } public Node visitEvalFunctionStatement(DPLParser.EvalFunctionStatementContext ctx) { - LOGGER.debug("visitEvalFunctionStatement incoming: text=<{}>", ctx.getText() ); + LOGGER.debug("visitEvalFunctionStatement incoming: text=<{}>", ctx.getText()); return evalStatement.visitEvalFunctionStatement(ctx); } @@ -181,15 +180,21 @@ public Node visitEvalStringType(DPLParser.EvalStringTypeContext ctx) { return evalStatement.visitEvalStringType(ctx); } - public Node visitL_evalStatement_evalCalculateStatement_multipliers(DPLParser.L_evalStatement_evalCalculateStatement_multipliersContext ctx){ + public Node visitL_evalStatement_evalCalculateStatement_multipliers( + DPLParser.L_evalStatement_evalCalculateStatement_multipliersContext ctx + ) { return evalStatement.visitL_evalStatement_evalCalculateStatement_multipliers(ctx); } - public Node visitL_evalStatement_evalCalculateStatement_minus_plus(DPLParser.L_evalStatement_evalCalculateStatement_minus_plusContext ctx){ + public Node visitL_evalStatement_evalCalculateStatement_minus_plus( + DPLParser.L_evalStatement_evalCalculateStatement_minus_plusContext ctx + ) { return evalStatement.visitL_evalStatement_evalCalculateStatement_minus_plus(ctx); } - public Node visitL_evalStatement_evalConcatenateStatement(DPLParser.L_evalStatement_evalConcatenateStatementContext ctx) { + public Node visitL_evalStatement_evalConcatenateStatement( + DPLParser.L_evalStatement_evalConcatenateStatementContext ctx + ) { return evalStatement.visitL_evalStatement_evalConcatenateStatement(ctx); } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/EventstatsTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/EventstatsTransformation.java index 3a08e7a..c91bdd4 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/EventstatsTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/EventstatsTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of 
the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -62,6 +61,7 @@ import java.util.List; public class EventstatsTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(EventstatsTransformation.class); private DPLParserCatalystContext catCtx; private final String hdfsPath; @@ -121,7 +121,7 @@ public Node visitT_eventstats_aggregationInstruction(DPLParser.T_eventstats_aggr Column aggCol = null; if (cmd instanceof TerminalNode) { - /* if (((TerminalNode)cmd).getSymbol().getType() == DPLLexer.COMMAND_EVENTSTATS_MODE_COUNT) { + /* if (((TerminalNode)cmd).getSymbol().getType() == DPLLexer.COMMAND_EVENTSTATS_MODE_COUNT) { LOGGER.info("Implied wildcard COUNT mode - count({}", ds.columns()[0] + ")"); aggCol = functions.count(ds.columns()[0]).as("count"); }*/ @@ -155,6 +155,7 @@ public Node visitT_eventstats_byInstruction(DPLParser.T_eventstats_byInstruction /** * Doesn't seem to get used, goes through aggregationInstruction->fieldRenameInstruction + * * @param ctx T_eventstats_fieldRenameInstructionContext * @return StringNode with rename field */ diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/ExplainTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/ExplainTransformation.java index 0ec8c62..3695bda 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/ExplainTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/ExplainTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -57,20 +56,17 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.List; - /** - * Base transformation class for explain command. - * Allows to view the Spark physical plan of the dataset + * Base transformation class for explain command. 
Allows to view the Spark physical plan of the dataset */ public class ExplainTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(ExplainTransformation.class); DPLParserCatalystContext catCtx = null; public ExplainStep explainStep = null; - public ExplainTransformation(DPLParserCatalystContext catCtx) - { + public ExplainTransformation(DPLParserCatalystContext catCtx) { this.catCtx = catCtx; } @@ -82,6 +78,7 @@ public Node visitExplainTransformation(DPLParser.ExplainTransformationContext ct /** * Gets the physical plan and puts in into a dataset to view + * * @param ctx explainTransformationContext * @return catalystnode containing result dataset */ diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/FieldsTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/FieldsTransformation.java index 2358bbf..12a2e4f 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/FieldsTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/FieldsTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -55,25 +54,21 @@ import com.teragrep.pth_03.antlr.DPLParserBaseVisitor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.w3c.dom.Document; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; - /** - * Base transformation class for the fields command. - * Allows the user to decide, which fields to retain or drop from the result set. + * Base transformation class for the fields command. Allows the user to decide, which fields to retain or drop from the + * result set. 
*/ public class FieldsTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(FieldsTransformation.class); DPLParserCatalystContext catCtx = null; public FieldsStep fieldsStep = null; - public FieldsTransformation(DPLParserCatalystContext catCtx) - { + public FieldsTransformation(DPLParserCatalystContext catCtx) { this.catCtx = catCtx; } @@ -87,30 +82,33 @@ public Node fieldsTransformationEmitCatalyst(DPLParser.FieldsTransformationConte String oper = ctx.getChild(1).getText(); if ("-".equals(oper)) { - StringListNode sln = (StringListNode)visit(ctx.fieldListType()); + StringListNode sln = (StringListNode) visit(ctx.fieldListType()); LOGGER.debug("Drop fields: stringListNode=<{}>", sln); this.fieldsStep.setMode(AbstractFieldsStep.FieldMode.REMOVE_FIELDS); this.fieldsStep.setListOfFields(sln.asList()); - } else { - StringListNode sln = (StringListNode)visit(ctx.fieldListType()); + } + else { + StringListNode sln = (StringListNode) visit(ctx.fieldListType()); this.fieldsStep.setMode(AbstractFieldsStep.FieldMode.KEEP_FIELDS); this.fieldsStep.setListOfFields(sln.asList()); } return new StepNode(fieldsStep); } - @Override public Node visitFieldListType(DPLParser.FieldListTypeContext ctx) { List fields = new ArrayList<>(); - ctx.children.forEach(f ->{ - // skip non-fieldType children - if (f instanceof DPLParser.FieldTypeContext) { - String fieldType = visit(f).toString(); - fields.add(fieldType); - } - }); + ctx.children + .forEach( + f -> { + // skip non-fieldType children + if (f instanceof DPLParser.FieldTypeContext) { + String fieldType = visit(f).toString(); + fields.add(fieldType); + } + } + ); return new StringListNode(fields); } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/FillnullTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/FillnullTransformation.java index 500ebaf..27bc8a0 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/FillnullTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/FillnullTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
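// Hedged sketch of the fields +/- modes above: '-' drops the listed columns, otherwise only
// the listed columns are kept. Method and column names are illustrative, not the pth_10 step API.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

class FieldsModeSketch {

    static Dataset<Row> applyFields(Dataset<Row> ds, boolean remove, String... fields) {
        if (remove) {
            // | fields - host, source
            return ds.drop(fields);
        }
        // | fields host, source
        return ds.selectExpr(fields);
    }
}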
* * * Additional permission under GNU Affero General Public License version 3 @@ -45,7 +45,6 @@ */ package com.teragrep.pth10.ast.commands.transformstatement; -import com.teragrep.pth10.ast.DPLParserCatalystContext; import com.teragrep.pth10.ast.NullValue; import com.teragrep.pth10.ast.TextString; import com.teragrep.pth10.ast.UnquotedText; @@ -60,8 +59,10 @@ import java.util.List; public class FillnullTransformation extends DPLParserBaseVisitor { + public final FillnullStep fillnullStep; private final NullValue nullValue; + public FillnullTransformation(NullValue nullValue) { this.nullValue = nullValue; this.fillnullStep = new FillnullStep(); diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/FormatTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/FormatTransformation.java index 7226049..4f2b57e 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/FormatTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/FormatTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -58,6 +58,7 @@ import java.util.List; public class FormatTransformation { + public FormatTransformation() { } @@ -69,7 +70,8 @@ public Node visitFormatTransformation(DPLParser.FormatTransformationContext ctx) List rowAndColumnOpts = new ArrayList<>(); if (ctx.t_format_maxresultsParameter() != null) { - maxResults = Integer.parseInt(visitT_format_maxresultsParameter(ctx.t_format_maxresultsParameter()).toString()); + maxResults = Integer + .parseInt(visitT_format_maxresultsParameter(ctx.t_format_maxresultsParameter()).toString()); } if (ctx.t_format_mvSeparatorParameter() != null) { @@ -78,10 +80,16 @@ public Node visitFormatTransformation(DPLParser.FormatTransformationContext ctx) if (ctx.stringType() != null && !ctx.stringType().isEmpty()) { if (ctx.stringType().size() == 6) { - ctx.stringType().forEach(strCtx -> rowAndColumnOpts.add(new UnquotedText(new TextString(strCtx.getText())).read())); - } else { - throw new IllegalArgumentException("All of the row and column options must be specified in the command: " + - "Row prefix, Column prefix, Column separator, Column suffix, Row separator and Row suffix. Only " + ctx.stringType().size() + " out of 6 options were provided."); + ctx + .stringType() + .forEach(strCtx -> rowAndColumnOpts.add(new UnquotedText(new TextString(strCtx.getText())).read())); + } + else { + throw new IllegalArgumentException( + "All of the row and column options must be specified in the command: " + + "Row prefix, Column prefix, Column separator, Column suffix, Row separator and Row suffix. Only " + + ctx.stringType().size() + " out of 6 options were provided." 
+ ); } } @@ -97,7 +105,6 @@ public Node visitFormatTransformation(DPLParser.FormatTransformationContext ctx) formatStep.setMaxResults(maxResults); formatStep.setMvSep(mvSeparator); - return new StepNode(formatStep); } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/IplocationTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/IplocationTransformation.java index c68d195..bb5e620 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/IplocationTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/IplocationTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.*; @@ -63,12 +62,14 @@ * {@literal | iplocation prefix= allfields= lang= } */ public class IplocationTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(IplocationTransformation.class); private static final String DATABASE_PATH_CONFIG_ITEM = "dpl.pth_10.transform.iplocation.db.path"; private final DPLParserCatalystContext catCtx; private final DPLParserCatalystVisitor catVisitor; public IplocationStep iplocationStep = null; + public IplocationTransformation(DPLParserCatalystContext catCtx, DPLParserCatalystVisitor catVisitor) { this.catCtx = catCtx; this.catVisitor = catVisitor; @@ -124,8 +125,10 @@ public Node visitT_iplocation_allFieldsParameter(DPLParser.T_iplocation_allField allFields = false; break; default: - throw new RuntimeException("Invalid boolean type provided for 'allfields' parameter!\nMake sure that" + - " the parameter is followed by 'true' or 'false'."); + throw new RuntimeException( + "Invalid boolean type provided for 'allfields' parameter!\nMake sure that" + + " the parameter is followed by 'true' or 'false'." 
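// Hedged sketch of the "all six row/column options" rule above: the options must be given as
// a complete set (row prefix, column prefix, column separator, column suffix, row separator,
// row suffix) and wrap every value group. The output shape mirrors the command's default
// "( ( ... AND ... ) OR ( ... ) )" style; exact whitespace is an assumption of this sketch.
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

class FormatOptionsSketch {

    static String format(List<List<String>> rows, String... opts) {
        if (opts.length != 6) {
            throw new IllegalArgumentException(
                    "All of the row and column options must be specified: got " + opts.length + " out of 6."
            );
        }
        String rowPrefix = opts[0], colPrefix = opts[1], colSep = opts[2];
        String colSuffix = opts[3], rowSep = opts[4], rowSuffix = opts[5];
        return rowPrefix + " " + rows.stream()
                .map(cols -> colPrefix + " " + String.join(" " + colSep + " ", cols) + " " + colSuffix)
                .collect(Collectors.joining(" " + rowSep + " ")) + " " + rowSuffix;
    }

    public static void main(String[] args) {
        List<List<String>> rows = Arrays.asList(
                Arrays.asList("host=\"a\"", "source=\"x\""),
                Arrays.asList("host=\"b\"", "source=\"y\"")
        );
        System.out.println(format(rows, "(", "(", "AND", ")", "OR", ")"));
        // ( ( host="a" AND source="x" ) OR ( host="b" AND source="y" ) )
    }
}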
+ ); } this.iplocationStep.setAllFields(allFields); diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/JoinTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/JoinTransformation.java index 64dfa6f..73be17c 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/JoinTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/JoinTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.*; @@ -59,291 +58,299 @@ import org.apache.spark.sql.Row; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.w3c.dom.Document; import java.util.ArrayList; import java.util.List; import java.util.Objects; /** - * Base transformation class for the join command. - * Allows the user to join two searches (their result sets) together + * Base transformation class for the join command. Allows the user to join two searches (their result sets) together */ public class JoinTransformation extends DPLParserBaseVisitor { - private static final Logger LOGGER = LoggerFactory.getLogger(JoinTransformation.class); - private final DPLParserCatalystContext catCtx; - private final DPLParserCatalystVisitor catVisitor; - private String pathForSubsearchSave = "/tmp/pth_10/join"; - public JoinStep joinStep = null; - - public JoinTransformation(DPLParserCatalystVisitor catVisitor, DPLParserCatalystContext catCtx) { - this.catVisitor = catVisitor; - this.catCtx = catCtx; - } - - /** - * {@literal | join left=L right=R where L.pid = R.pid [subsearch]} - *
- * Command examples:
- * ... | join product_id [search vendors] - * combine results from main search with result from a subsearch "search vendors". Sets are joined on the product_id field. - *
- * ... | join product_id max=0 [search vendors] - * returns all subsearch rows; by default only first one is returned. - *
- * ... | join left=L right=R where L.product.id= - * R.product_id [search vendors] - * combine results from a search with the vendors dataset. - *
- * ... | join left=L right=R where L.product_id= - * R.pid [search vendors] - * different field names - *
- * - * A maximum of 50,000 rows in the right-side dataset can be joined with the left-side dataset - *
COMMAND_MODE_JOIN (t_join_joinOptionsParameter)*? fieldListType? t_join_unnamedDatasetParameter
- */ - @Override - public Node visitJoinTransformation(DPLParser.JoinTransformationContext ctx) { - LOGGER.info("visitJoinTransformation incoming: text=<{}>", ctx.getText()); + + private static final Logger LOGGER = LoggerFactory.getLogger(JoinTransformation.class); + private final DPLParserCatalystContext catCtx; + private final DPLParserCatalystVisitor catVisitor; + private String pathForSubsearchSave = "/tmp/pth_10/join"; + public JoinStep joinStep = null; + + public JoinTransformation(DPLParserCatalystVisitor catVisitor, DPLParserCatalystContext catCtx) { + this.catVisitor = catVisitor; + this.catCtx = catCtx; + } + + /** + * {@literal | join left=L right=R where L.pid = R.pid [subsearch]} + *
+ * Command examples:
+ * ... | join product_id [search vendors] combine results from main search with result from a subsearch + * "search vendors". Sets are joined on the product_id field. + *
+ * ... | join product_id max=0 [search vendors] returns all subsearch rows; by default only first one + * is returned. + *
+ * ... | join left=L right=R where L.product.id= + * R.product_id [search vendors] combine results from a search with the vendors dataset. + *
+ * ... | join left=L right=R where L.product_id= + * R.pid [search vendors] different field names + *
+ * A maximum of 50,000 rows in the right-side dataset can be joined with the left-side dataset + *
COMMAND_MODE_JOIN (t_join_joinOptionsParameter)*? fieldListType? t_join_unnamedDatasetParameter
+ */ + @Override + public Node visitJoinTransformation(DPLParser.JoinTransformationContext ctx) { + LOGGER.info("visitJoinTransformation incoming: text=<{}>", ctx.getText()); return joinTransformationEmitCatalyst(ctx); - } - - /** - * Gets the join options from the command, performs a subsearch - which is saved to HDFS, - * and performs the stream-static join. - * @param ctx - * @return - */ - private Node joinTransformationEmitCatalyst(DPLParser.JoinTransformationContext ctx) { - this.joinStep = new JoinStep(); - - Dataset subSearchDs = null; // Contains the subsearch result dataframe (right side) - List listOfFields = null; // Contains names of all the fields as strings (Java List) - - // Variables used for all the different join options / parameters - String joinMode = "inner"; - Boolean usetime = false; - Boolean earlier = true; - Boolean overwrite = true; - Integer max = 1; - - // Go through all children - for (int i = 0; i < ctx.getChildCount(); i++) { - ParseTree child = ctx.getChild(i); - LOGGER.debug("Child(<{}>) content: <{}>", i, child.getText()); - - if (child instanceof DPLParser.T_join_joinOptionsParameterContext) { - // Get all the different join options / parameters - LOGGER.debug("Child(<{}>) is instanceof join options", i); - - for (int j = 0; j < child.getChildCount(); j++) { - ParseTree joinOptionsChild = child.getChild(j); - - if (joinOptionsChild instanceof DPLParser.T_join_typeParameterContext) { - StringNode typeParam = (StringNode) visit(joinOptionsChild); - joinMode = typeParam.toString(); - } else if (joinOptionsChild instanceof DPLParser.T_join_usetimeParameterContext) { - StringNode usetimeParam = (StringNode) visit(joinOptionsChild); - usetime = (Objects.equals(usetimeParam.toString(), "true")); - } else if (joinOptionsChild instanceof DPLParser.T_join_earlierParameterContext) { - StringNode earlierParam = (StringNode) visit(joinOptionsChild); - earlier = (Objects.equals(earlierParam.toString(), "true")); - } else if (joinOptionsChild instanceof DPLParser.T_join_overwriteParameterContext) { - StringNode overwriteParam = (StringNode) visit(joinOptionsChild); - overwrite = (Objects.equals(overwriteParam.toString(), "true")); - } else if (joinOptionsChild instanceof DPLParser.T_join_maxParameterContext) { - StringNode maxParam = (StringNode) visit(joinOptionsChild); - max = Integer.parseInt(maxParam.toString()); - } - } - } else if (child instanceof DPLParser.T_join_unnamedDatasetParameterContext) { - // perform subsearch - LOGGER.debug("Child <{}> is instanceof dataset parameter", i); - visit(child); - } else if (child instanceof DPLParser.FieldListTypeContext) { - LOGGER.debug("Child <{}> is instanceof fieldlist type", i); - // Visit FieldListType and place fields as Columns in seqOfFields - StringListNode listOfFieldsNode = (StringListNode) visit(child); - - listOfFields = listOfFieldsNode.asList(); - } else if (child instanceof TerminalNode) { - LOGGER.debug("Child <{}> is instanceof terminalnode", i); - // should be COMMAND_JOIN_MODE - // no action needed as it is just a command keyword - } - // everything else is invalid and not processed - } - - LOGGER.debug("--- Join parameters ---"); - LOGGER.debug("join mode= <{}>", joinMode); - LOGGER.debug("usetime= <{}>", usetime); - LOGGER.debug("earlier= <{}>", earlier); - LOGGER.debug("overwrite= <{}>", overwrite); - LOGGER.debug("max= <{}>", max); - LOGGER.debug("-----------------------"); - - this.pathForSubsearchSave = this.catVisitor.getHdfsPath(); - - // step - this.joinStep.setJoinMode(joinMode); - 
this.joinStep.setEarlier(earlier); - this.joinStep.setMax(max); - this.joinStep.setOverwrite(overwrite); - this.joinStep.setListOfFields(listOfFields); - this.joinStep.setUsetime(usetime); - this.joinStep.setPathForSubsearchSave(pathForSubsearchSave); - this.joinStep.setSubSearchDataset(subSearchDs); - this.joinStep.setCatCtx(catCtx); - - return new StepNode(joinStep); - } - - @Override - public Node visitFieldListType(DPLParser.FieldListTypeContext ctx) { - Node rv = null; - - List fieldList = new ArrayList<>(); - ctx.children.forEach(field -> { - String fieldName = new UnquotedText(new TextString(field.getText())).read(); - - if (!fieldName.equals(",")) { - fieldList.add(fieldName); - } - - }); - - rv = new StringListNode(fieldList); - return rv; - } - - - // COMMAND_JOIN_TYPE (COMMAND_JOIN_GET_TYPE_MODE_OUTER|COMMAND_JOIN_GET_TYPE_MODE_LEFT| - // COMMAND_JOIN_GET_TYPE_MODE_INNER) - @Override - public Node visitT_join_typeParameter(DPLParser.T_join_typeParameterContext ctx) { - Node rv = t_join_typeParameterEmitCatalyst(ctx); - return rv; - } - - private Node t_join_typeParameterEmitCatalyst(DPLParser.T_join_typeParameterContext ctx) { - Node rv = null; - - TerminalNode type = (TerminalNode) ctx.getChild(1); - - rv = new StringNode(new Token(Token.Type.STRING, type.getText())); - return rv; - } - - // COMMAND_JOIN_MODE_USETIME booleanType - @Override - public Node visitT_join_usetimeParameter(DPLParser.T_join_usetimeParameterContext ctx) { - Node rv = t_join_usetimeParameterEmitCatalyst(ctx); - return rv; - } - - private Node t_join_usetimeParameterEmitCatalyst(DPLParser.T_join_usetimeParameterContext ctx) { - Node rv = null; - // COMMAND_JOIN_MODE_USETIME booleanType - - TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); - rv = getBooleanFromTerminalNode(booleanValue); - return rv; - } - - // COMMAND_JOIN_MODE_EARLIER booleanType - @Override - public Node visitT_join_earlierParameter(DPLParser.T_join_earlierParameterContext ctx) { - Node rv = t_join_earlierParameterEmitCatalyst(ctx); - return rv; - } - - private Node t_join_earlierParameterEmitCatalyst(DPLParser.T_join_earlierParameterContext ctx) { - Node rv = null; - - TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); - rv = getBooleanFromTerminalNode(booleanValue); - return rv; - } - - // COMMAND_JOIN_MODE_OVERWRITE booleanType - @Override - public Node visitT_join_overwriteParameter(DPLParser.T_join_overwriteParameterContext ctx) { - Node rv = t_join_overwriteParameterEmitCatalyst(ctx); - return rv; - } - - private Node t_join_overwriteParameterEmitCatalyst(DPLParser.T_join_overwriteParameterContext ctx) { - Node rv = null; - - TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); - rv = getBooleanFromTerminalNode(booleanValue); - return rv; - } - - // COMMAND_JOIN_MODE_MAX integerType - @Override - public Node visitT_join_maxParameter(DPLParser.T_join_maxParameterContext ctx) { - Node rv = t_join_maxParameterEmitCatalyst(ctx); - return rv; - } - - private Node t_join_maxParameterEmitCatalyst(DPLParser.T_join_maxParameterContext ctx) { - Node rv = null; - - TerminalNode integerValue = (TerminalNode) ctx.getChild(1).getChild(0); - String value = integerValue.getText(); - - rv = new StringNode(new Token(Token.Type.STRING, value)); - return rv; - } - - // subsearchStatement: [ PIPE? 
subsearchTransformStatement ] - @Override - public Node visitT_join_unnamedDatasetParameter(DPLParser.T_join_unnamedDatasetParameterContext ctx) { - Node rv = t_join_unnamedDatasetParameterEmitCatalyst(ctx); - return rv; - } - - private Node t_join_unnamedDatasetParameterEmitCatalyst(DPLParser.T_join_unnamedDatasetParameterContext ctx) { - LOGGER.info("Visiting unnamedDatasetParameter: text=<{}>, with children=<{}>", ctx.getText(), ctx.getChildCount()); - - for (int i = 0; i < ctx.getChildCount(); i++) { - ParseTree child = ctx.getChild(i); - LOGGER.debug("child on unnamedDatasetParam: text=<{}>", child.getText()); - - if (child instanceof DPLParser.SubsearchStatementContext) { - LOGGER.debug("child instanceof SubsearchStmtCtx: text=<{}>", child.getText()); - DPLParserCatalystVisitor ssVisitor = new DPLParserCatalystVisitor(catCtx); - StepNode ssStepNode = (StepNode) ssVisitor.visitSubsearchStatement(((DPLParser.SubsearchStatementContext)child)); - SubsearchStep ssStep = (SubsearchStep) ssStepNode.get(); - - this.joinStep.setSubsearchStep(ssStep); - } - - } - - return null; - } - - - /** - * Converts a TerminalNode containing BooleanType into StringNode with content "true" or "false" - * @param tn TerminalNode containing a BooleanType - * @return StringNode with either value "true" or "false" - */ - private StringNode getBooleanFromTerminalNode(TerminalNode tn) { - String value = ""; - - switch (tn.getSymbol().getType()) { - case DPLLexer.GET_BOOLEAN_TRUE: - value = "true"; - break; - case DPLLexer.GET_BOOLEAN_FALSE: - value = "false"; - break; - } - - return new StringNode(new Token(Token.Type.STRING, value)); - } + } + + /** + * Gets the join options from the command, performs a subsearch - which is saved to HDFS, and performs the + * stream-static join. 
+ * + * @param ctx + * @return + */ + private Node joinTransformationEmitCatalyst(DPLParser.JoinTransformationContext ctx) { + this.joinStep = new JoinStep(); + + Dataset subSearchDs = null; // Contains the subsearch result dataframe (right side) + List listOfFields = null; // Contains names of all the fields as strings (Java List) + + // Variables used for all the different join options / parameters + String joinMode = "inner"; + Boolean usetime = false; + Boolean earlier = true; + Boolean overwrite = true; + Integer max = 1; + + // Go through all children + for (int i = 0; i < ctx.getChildCount(); i++) { + ParseTree child = ctx.getChild(i); + LOGGER.debug("Child(<{}>) content: <{}>", i, child.getText()); + + if (child instanceof DPLParser.T_join_joinOptionsParameterContext) { + // Get all the different join options / parameters + LOGGER.debug("Child(<{}>) is instanceof join options", i); + + for (int j = 0; j < child.getChildCount(); j++) { + ParseTree joinOptionsChild = child.getChild(j); + + if (joinOptionsChild instanceof DPLParser.T_join_typeParameterContext) { + StringNode typeParam = (StringNode) visit(joinOptionsChild); + joinMode = typeParam.toString(); + } + else if (joinOptionsChild instanceof DPLParser.T_join_usetimeParameterContext) { + StringNode usetimeParam = (StringNode) visit(joinOptionsChild); + usetime = (Objects.equals(usetimeParam.toString(), "true")); + } + else if (joinOptionsChild instanceof DPLParser.T_join_earlierParameterContext) { + StringNode earlierParam = (StringNode) visit(joinOptionsChild); + earlier = (Objects.equals(earlierParam.toString(), "true")); + } + else if (joinOptionsChild instanceof DPLParser.T_join_overwriteParameterContext) { + StringNode overwriteParam = (StringNode) visit(joinOptionsChild); + overwrite = (Objects.equals(overwriteParam.toString(), "true")); + } + else if (joinOptionsChild instanceof DPLParser.T_join_maxParameterContext) { + StringNode maxParam = (StringNode) visit(joinOptionsChild); + max = Integer.parseInt(maxParam.toString()); + } + } + } + else if (child instanceof DPLParser.T_join_unnamedDatasetParameterContext) { + // perform subsearch + LOGGER.debug("Child <{}> is instanceof dataset parameter", i); + visit(child); + } + else if (child instanceof DPLParser.FieldListTypeContext) { + LOGGER.debug("Child <{}> is instanceof fieldlist type", i); + // Visit FieldListType and place fields as Columns in seqOfFields + StringListNode listOfFieldsNode = (StringListNode) visit(child); + + listOfFields = listOfFieldsNode.asList(); + } + else if (child instanceof TerminalNode) { + LOGGER.debug("Child <{}> is instanceof terminalnode", i); + // should be COMMAND_JOIN_MODE + // no action needed as it is just a command keyword + } + // everything else is invalid and not processed + } + + LOGGER.debug("--- Join parameters ---"); + LOGGER.debug("join mode= <{}>", joinMode); + LOGGER.debug("usetime= <{}>", usetime); + LOGGER.debug("earlier= <{}>", earlier); + LOGGER.debug("overwrite= <{}>", overwrite); + LOGGER.debug("max= <{}>", max); + LOGGER.debug("-----------------------"); + + this.pathForSubsearchSave = this.catVisitor.getHdfsPath(); + + // step + this.joinStep.setJoinMode(joinMode); + this.joinStep.setEarlier(earlier); + this.joinStep.setMax(max); + this.joinStep.setOverwrite(overwrite); + this.joinStep.setListOfFields(listOfFields); + this.joinStep.setUsetime(usetime); + this.joinStep.setPathForSubsearchSave(pathForSubsearchSave); + this.joinStep.setSubSearchDataset(subSearchDs); + this.joinStep.setCatCtx(catCtx); + + return new 
StepNode(joinStep); + } + + @Override + public Node visitFieldListType(DPLParser.FieldListTypeContext ctx) { + Node rv = null; + + List fieldList = new ArrayList<>(); + ctx.children.forEach(field -> { + String fieldName = new UnquotedText(new TextString(field.getText())).read(); + + if (!fieldName.equals(",")) { + fieldList.add(fieldName); + } + + }); + + rv = new StringListNode(fieldList); + return rv; + } + + // COMMAND_JOIN_TYPE (COMMAND_JOIN_GET_TYPE_MODE_OUTER|COMMAND_JOIN_GET_TYPE_MODE_LEFT| + // COMMAND_JOIN_GET_TYPE_MODE_INNER) + @Override + public Node visitT_join_typeParameter(DPLParser.T_join_typeParameterContext ctx) { + Node rv = t_join_typeParameterEmitCatalyst(ctx); + return rv; + } + + private Node t_join_typeParameterEmitCatalyst(DPLParser.T_join_typeParameterContext ctx) { + Node rv = null; + + TerminalNode type = (TerminalNode) ctx.getChild(1); + + rv = new StringNode(new Token(Token.Type.STRING, type.getText())); + return rv; + } + + // COMMAND_JOIN_MODE_USETIME booleanType + @Override + public Node visitT_join_usetimeParameter(DPLParser.T_join_usetimeParameterContext ctx) { + Node rv = t_join_usetimeParameterEmitCatalyst(ctx); + return rv; + } + + private Node t_join_usetimeParameterEmitCatalyst(DPLParser.T_join_usetimeParameterContext ctx) { + Node rv = null; + // COMMAND_JOIN_MODE_USETIME booleanType + + TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); + rv = getBooleanFromTerminalNode(booleanValue); + return rv; + } + + // COMMAND_JOIN_MODE_EARLIER booleanType + @Override + public Node visitT_join_earlierParameter(DPLParser.T_join_earlierParameterContext ctx) { + Node rv = t_join_earlierParameterEmitCatalyst(ctx); + return rv; + } + + private Node t_join_earlierParameterEmitCatalyst(DPLParser.T_join_earlierParameterContext ctx) { + Node rv = null; + + TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); + rv = getBooleanFromTerminalNode(booleanValue); + return rv; + } + + // COMMAND_JOIN_MODE_OVERWRITE booleanType + @Override + public Node visitT_join_overwriteParameter(DPLParser.T_join_overwriteParameterContext ctx) { + Node rv = t_join_overwriteParameterEmitCatalyst(ctx); + return rv; + } + + private Node t_join_overwriteParameterEmitCatalyst(DPLParser.T_join_overwriteParameterContext ctx) { + Node rv = null; + + TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); + rv = getBooleanFromTerminalNode(booleanValue); + return rv; + } + + // COMMAND_JOIN_MODE_MAX integerType + @Override + public Node visitT_join_maxParameter(DPLParser.T_join_maxParameterContext ctx) { + Node rv = t_join_maxParameterEmitCatalyst(ctx); + return rv; + } + + private Node t_join_maxParameterEmitCatalyst(DPLParser.T_join_maxParameterContext ctx) { + Node rv = null; + + TerminalNode integerValue = (TerminalNode) ctx.getChild(1).getChild(0); + String value = integerValue.getText(); + + rv = new StringNode(new Token(Token.Type.STRING, value)); + return rv; + } + + // subsearchStatement: [ PIPE? 
subsearchTransformStatement ] + @Override + public Node visitT_join_unnamedDatasetParameter(DPLParser.T_join_unnamedDatasetParameterContext ctx) { + Node rv = t_join_unnamedDatasetParameterEmitCatalyst(ctx); + return rv; + } + + private Node t_join_unnamedDatasetParameterEmitCatalyst(DPLParser.T_join_unnamedDatasetParameterContext ctx) { + LOGGER + .info( + "Visiting unnamedDatasetParameter: text=<{}>, with children=<{}>", ctx.getText(), + ctx.getChildCount() + ); + + for (int i = 0; i < ctx.getChildCount(); i++) { + ParseTree child = ctx.getChild(i); + LOGGER.debug("child on unnamedDatasetParam: text=<{}>", child.getText()); + + if (child instanceof DPLParser.SubsearchStatementContext) { + LOGGER.debug("child instanceof SubsearchStmtCtx: text=<{}>", child.getText()); + DPLParserCatalystVisitor ssVisitor = new DPLParserCatalystVisitor(catCtx); + StepNode ssStepNode = (StepNode) ssVisitor + .visitSubsearchStatement(((DPLParser.SubsearchStatementContext) child)); + SubsearchStep ssStep = (SubsearchStep) ssStepNode.get(); + + this.joinStep.setSubsearchStep(ssStep); + } + + } + + return null; + } + + /** + * Converts a TerminalNode containing BooleanType into StringNode with content "true" or "false" + * + * @param tn TerminalNode containing a BooleanType + * @return StringNode with either value "true" or "false" + */ + private StringNode getBooleanFromTerminalNode(TerminalNode tn) { + String value = ""; + + switch (tn.getSymbol().getType()) { + case DPLLexer.GET_BOOLEAN_TRUE: + value = "true"; + break; + case DPLLexer.GET_BOOLEAN_FALSE: + value = "false"; + break; + } + + return new StringNode(new Token(Token.Type.STRING, value)); + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/MakeresultsTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/MakeresultsTransformation.java index 6355b86..58b2e91 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/MakeresultsTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/MakeresultsTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -53,8 +52,6 @@ import com.teragrep.pth_03.antlr.DPLParser; import com.teragrep.pth_03.antlr.DPLParserBaseVisitor; import com.teragrep.pth_03.shaded.org.antlr.v4.runtime.tree.TerminalNode; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.List; @@ -66,8 +63,10 @@ * Generates $count rows with _time column. 
More columns can be added by setting $annotate=true */ public class MakeresultsTransformation extends DPLParserBaseVisitor { + private final DPLParserCatalystContext catCtx; public MakeresultsStep makeresultsStep = null; + public MakeresultsTransformation(DPLParserCatalystContext catCtx) { this.catCtx = catCtx; } @@ -78,8 +77,8 @@ public Node visitMakeresultsTransformation(DPLParser.MakeresultsTransformationCo } /** - * Sets all the parameters based on the values given on the command, and generates - * a streaming dataset. + * Sets all the parameters based on the values given on the command, and generates a streaming dataset. + * * @param ctx * @return */ @@ -105,14 +104,17 @@ private Node makeresultsTransformationEmitCatalyst(DPLParser.MakeresultsTransfor count = Integer.parseInt(countParameter); if (count < 1 || count > 2_000_000) { // based on local testing >2M causes memory issues and running out of heap space - throw new IllegalArgumentException("Makeresults: Count parameter value must be a positive integer between 1 and 2 000 000."); + throw new IllegalArgumentException( + "Makeresults: Count parameter value must be a positive integer between 1 and 2 000 000." + ); } } else { - throw new IllegalArgumentException("Makeresults: Invalid count parameter value provided! It must be a positive integer between 1 and 2 000 000."); + throw new IllegalArgumentException( + "Makeresults: Invalid count parameter value provided! It must be a positive integer between 1 and 2 000 000." + ); } - } if (ctx.t_makeresults_struckServerGroupParameter() != null) { @@ -125,7 +127,6 @@ private Node makeresultsTransformationEmitCatalyst(DPLParser.MakeresultsTransfor server = ctx.t_makeresults_struckServerParameter().getText(); } - this.makeresultsStep.setAnnotate(annotate); this.makeresultsStep.setServer(server); this.makeresultsStep.setCount(count); @@ -152,5 +153,4 @@ public Node visitT_makeresults_annotateOptParameter(DPLParser.T_makeresults_anno return new StringNode(new Token(Token.Type.STRING, value)); } - } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/PredictTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/PredictTransformation.java index 042669c..aad47da 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/PredictTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/PredictTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
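A minimal, illustrative sketch (not part of this patch) of what the makeresults command shown above conceptually produces: count rows carrying a _time column. The real MakeresultsStep builds a streaming dataset, so the batch code below is only an approximation and the class name is hypothetical:

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;
    import static org.apache.spark.sql.functions.current_timestamp;

    final class MakeresultsSketch {
        // "| makeresults count=N" boils down to N rows with a _time column
        static Dataset<Row> makeResults(SparkSession spark, int count) {
            return spark.range(count)                      // one row per id in [0, count)
                    .withColumn("_time", current_timestamp())
                    .drop("id");                           // keep only the _time column
        }
    }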
*/ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.TextString; @@ -66,9 +65,11 @@ import java.util.List; public class PredictTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(PredictTransformation.class); public PredictStep predictStep = null; + public PredictTransformation() { } @@ -89,16 +90,15 @@ public Node visitPredictTransformation(DPLParser.PredictTransformationContext ct String upperField = null; String lowerField = null; - for (int i = 0; i < ctx.getChildCount(); i++) { ParseTree child = ctx.getChild(i); if (child instanceof DPLParser.FieldTypeContext) { - ParseTree nextChild = ctx.getChild(i+1); + ParseTree nextChild = ctx.getChild(i + 1); // AS if (nextChild instanceof DPLParser.T_predict_fieldRenameInstructionContext) { - listOfColumnsToPredict.add(functions.col(new UnquotedText(new TextString(child.getText())).read()) - .as(new UnquotedText(new TextString(nextChild.getChild(1).getText())).read())); + listOfColumnsToPredict + .add(functions.col(new UnquotedText(new TextString(child.getText())).read()).as(new UnquotedText(new TextString(nextChild.getChild(1).getText())).read())); i++; // Skip next child, as it would be the same fieldRenameInstruction again } // @@ -132,19 +132,23 @@ else if (child instanceof DPLParser.T_predict_pdAlgoOptionParameterContext) { } } else if (child instanceof DPLParser.T_predict_pdCorrelateOptionParameterContext) { - DPLParser.FieldTypeContext ftCtx = ((DPLParser.T_predict_pdCorrelateOptionParameterContext)child).fieldType(); + DPLParser.FieldTypeContext ftCtx = ((DPLParser.T_predict_pdCorrelateOptionParameterContext) child) + .fieldType(); correlateField = new UnquotedText(new TextString(ftCtx.getText())).read(); } else if (child instanceof DPLParser.T_predict_pdFutureTimespanOptionParameterContext) { - DPLParser.NumberTypeContext ntCtx = ((DPLParser.T_predict_pdFutureTimespanOptionParameterContext)child).numberType(); + DPLParser.NumberTypeContext ntCtx = ((DPLParser.T_predict_pdFutureTimespanOptionParameterContext) child) + .numberType(); futureTimespan = Integer.parseInt(ntCtx.getText()); } else if (child instanceof DPLParser.T_predict_pdHoldbackOptionParameterContext) { - DPLParser.NumberTypeContext ntCtx = ((DPLParser.T_predict_pdHoldbackOptionParameterContext)child).numberType(); + DPLParser.NumberTypeContext ntCtx = ((DPLParser.T_predict_pdHoldbackOptionParameterContext) child) + .numberType(); holdback = Integer.parseInt(ntCtx.getText()); } else if (child instanceof DPLParser.T_predict_pdPeriodOptionParameterContext) { - DPLParser.NumberTypeContext ntCtx = ((DPLParser.T_predict_pdPeriodOptionParameterContext)child).numberType(); + DPLParser.NumberTypeContext ntCtx = ((DPLParser.T_predict_pdPeriodOptionParameterContext) child) + .numberType(); period = Integer.parseInt(ntCtx.getText()); } else if (child instanceof DPLParser.T_predict_pdUpperOptionParameterContext) { @@ -158,7 +162,11 @@ else if (child instanceof DPLParser.T_predict_pdLowerOptionParameterContext) { lowerField = new UnquotedText(new TextString(lopCtx.fieldType().getText())).read(); } else if (child instanceof DPLParser.T_predict_pdSuppressOptionParameterContext) { - suppressField = new UnquotedText(new TextString(((DPLParser.T_predict_pdSuppressOptionParameterContext)child).fieldType().getText())).read(); + suppressField = new UnquotedText( + new TextString( + ((DPLParser.T_predict_pdSuppressOptionParameterContext) child).fieldType().getText() + ) + ).read(); } else if (child 
instanceof TerminalNode) { // skip TerminalNode diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RangemapTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RangemapTransformation.java index a67b598..75e2b08 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RangemapTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RangemapTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -59,8 +59,10 @@ import java.util.HashMap; public class RangemapTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(RangemapTransformation.class); private final RangemapStep rangemapStep; + public RangemapTransformation() { this.rangemapStep = new RangemapStep(); this.rangemapStep.attributeRangeMap = new HashMap<>(); @@ -85,14 +87,20 @@ public Node visitT_rangemap_fieldParameter(DPLParser.T_rangemap_fieldParameterCo public Node visitT_rangemap_attrnParameter(DPLParser.T_rangemap_attrnParameterContext ctx) { final String key = ctx.stringType().getText(); String valueLeft = ctx.t_rangemap_rangeParameter().GET_RANGE_NUMBER_LEFT().getText(); - final String valueRight = ctx.t_rangemap_rangeParameter().t_rangemap_rangeRightParameter().GET_RANGE_NUMBER_RIGHT().getText(); + final String valueRight = ctx + .t_rangemap_rangeParameter() + .t_rangemap_rangeRightParameter() + .GET_RANGE_NUMBER_RIGHT() + .getText(); // left side of range contains a trailing '-' character which needs to be removed if (valueLeft.endsWith("-")) { valueLeft = valueLeft.substring(0, valueLeft.length() - 1); } - this.rangemapStep.attributeRangeMap.put(key, new String[]{valueLeft, valueRight}); + this.rangemapStep.attributeRangeMap.put(key, new String[] { + valueLeft, valueRight + }); return new NullNode(); } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RegexTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RegexTransformation.java index 2f6487d..180efdc 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RegexTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RegexTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. 
If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.bo.Node; @@ -61,6 +60,7 @@ * Base visitor class for command regex */ public class RegexTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(RegexTransformation.class); public RegexStep regexStep = null; @@ -72,7 +72,7 @@ public RegexTransformation() { (root (searchTransformationRoot (logicalStatement (searchQualifier index = (stringType index_A)))) (transformStatement | (regexTransformation regex (fieldType _raw) != (regexStringType "data data")) (transformStatement ))) - + (root (searchTransformationRoot (logicalStatement (searchQualifier index = (stringType index_A)))) (transformStatement | (regexTransformation regex (fieldType _raw) = (regexStringType "data data")) (transformStatement ))) @@ -104,10 +104,10 @@ public Node visitRegexTransformation(DPLParser.RegexTransformationContext ctx) { ParseTree eq = ctx.getChild(2); LOGGER.debug(eq.getText()); if (eq instanceof TerminalNode) { - if (((TerminalNode)eq).getSymbol().getType() == DPLLexer.COMMAND_REGEX_MODE_EQ) { + if (((TerminalNode) eq).getSymbol().getType() == DPLLexer.COMMAND_REGEX_MODE_EQ) { equals = true; } - else if (((TerminalNode)eq).getSymbol().getType() == DPLLexer.COMMAND_REGEX_MODE_NEQ) { + else if (((TerminalNode) eq).getSymbol().getType() == DPLLexer.COMMAND_REGEX_MODE_NEQ) { equals = false; } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RenameTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RenameTransformation.java index feb6b78..0c7588f 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RenameTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RenameTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.TextString; @@ -66,9 +65,11 @@ *
{@literal ... | rename  AS }
*/ public class RenameTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(RenameTransformation.class); public RenameStep renameStep = null; + public RenameTransformation() { } @@ -107,7 +108,8 @@ else if (child instanceof TerminalNode) { if (originalName != null && newName != null) { // rename the column based on original and new name - mapOfRenamedFields.put(new UnquotedText(new TextString(originalName)).read(), new UnquotedText(new TextString(newName)).read()); + mapOfRenamedFields + .put(new UnquotedText(new TextString(originalName)).read(), new UnquotedText(new TextString(newName)).read()); originalName = null; newName = null; } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/ReplaceTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/ReplaceTransformation.java index 042571a..c72665b 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/ReplaceTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/ReplaceTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
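A minimal sketch (illustrative only, not part of this patch) of what the mapOfRenamedFields built above amounts to when applied to a Spark dataset; RenameStep's actual implementation is not shown here and the class name below is hypothetical:

    import java.util.Map;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    final class RenameSketch {
        // apply each original -> new name pair; withColumnRenamed is a no-op
        // when the original column does not exist in the dataset
        static Dataset<Row> applyRenames(Dataset<Row> ds, Map<String, String> renames) {
            for (Map.Entry<String, String> e : renames.entrySet()) {
                ds = ds.withColumnRenamed(e.getKey(), e.getValue());
            }
            return ds;
        }
    }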
*/ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.TextString; @@ -67,6 +66,7 @@ * The base transformation class used for the command replace */ public class ReplaceTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(ReplaceTransformation.class); private final Map replacements; private final List listOfFields; @@ -79,6 +79,7 @@ public ReplaceTransformation() { /** * Gets all the parameters from given command and applies the {@link ReplaceCmd} + * * @param ctx ReplaceTransformationContext * @return StepNode with Step for replace command */ @@ -103,6 +104,7 @@ public Node visitT_replace_withInstruction(DPLParser.T_replace_withInstructionCo /** * Gets the list of field names from the parse tree / command + * * @param ctx fieldList context * @return NullNode */ diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RexTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RexTransformation.java index 72bddd4..0dc96b6 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RexTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/RexTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -58,6 +57,7 @@ import org.slf4j.LoggerFactory; public class RexTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(RexTransformation.class); private DPLParserCatalystContext catCtx; @@ -81,7 +81,9 @@ public Node visitRexTransformation(DPLParser.RexTransformationContext ctx) { regexStr = new UnquotedText(new TextString(ctx.regexStringType().getText())).read(); } else { - throw new IllegalArgumentException("Either a sed-style string or a regex extraction string is required to be provided in the command, depending on the selected mode."); + throw new IllegalArgumentException( + "Either a sed-style string or a regex extraction string is required to be provided in the command, depending on the selected mode." 
+ ); } if (ctx.t_rex_fieldParameter() != null) { @@ -99,7 +101,8 @@ public Node visitRexTransformation(DPLParser.RexTransformationContext ctx) { } if (ctx.t_rex_offsetFieldParameter() != null) { - offsetFieldParam = new UnquotedText(new TextString(ctx.t_rex_offsetFieldParameter().stringType().getText())).read(); + offsetFieldParam = new UnquotedText(new TextString(ctx.t_rex_offsetFieldParameter().stringType().getText())) + .read(); } LOGGER.debug("regexStr= <{}>", regexStr); @@ -118,5 +121,4 @@ public Node visitRexTransformation(DPLParser.RexTransformationContext ctx) { return new StepNode(rexStep); } - } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SearchTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SearchTransformation.java index e94b458..8f4ecdf 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SearchTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SearchTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -56,16 +55,15 @@ import org.slf4j.LoggerFactory; /** - * Base transformation for the 'search' command - *
| 'search' logicalStatement 
+ * Base transformation for the 'search' command
| 'search' logicalStatement 
*/ public class SearchTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(SearchTransformation.class); public final SearchStep searchStep; private final DPLParserCatalystContext catCtx; - public SearchTransformation(DPLParserCatalystContext catCtx) - { + public SearchTransformation(DPLParserCatalystContext catCtx) { this.searchStep = new SearchStep(); this.catCtx = catCtx; } @@ -93,7 +91,9 @@ public Node searchTransformationEmitCatalyst(DPLParser.SearchTransformationConte } else { - throw new IllegalStateException("Invalid search command. Expected SearchTransformationRoot, instead got '" + ctx.getText() + "'"); + throw new IllegalStateException( + "Invalid search command. Expected SearchTransformationRoot, instead got '" + ctx.getText() + "'" + ); } return new StepNode(this.searchStep); diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SendemailTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SendemailTransformation.java index c82b39b..5dcc13e 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SendemailTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SendemailTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -62,768 +61,812 @@ import org.slf4j.LoggerFactory; /** - * Base transformation class for the 'sendemail' command - *
... | sendemail to=...
+ * Base transformation class for the 'sendemail' command
... | sendemail to=...
*/ public class SendemailTransformation extends DPLParserBaseVisitor<Node> { - private static final Logger LOGGER = LoggerFactory.getLogger(SendemailTransformation.class); - private DPLParserCatalystContext catCtx = null; - public SendemailStep sendemailStep = null; - - - public SendemailTransformation(DPLParserCatalystContext catCtx) { - this.catCtx = catCtx; - } - - /**
-	 * Command info:
-	 * Generates email notifications. Email search results to specified email addresses.
-	 * SMTP server needs to be available to send email.
-	 * 
-	 * Syntax:
-	 * sendemail
-	 * 	to= list of emails [REQUIRED, other params optional]
-	 *  cc= list of emails
-	 *  bcc= list of emails
-	 *  subject= string
-	 *  format= csv|table|raw
-	 *  inline= bool
-	 *  sendresults= bool
-	 *  sendpdf= bool
-	 *  priority= highest|high|normal|low|lowest
-	 *  server= string
-	 *  width_sort_columns= bool
-	 *  graceful= bool
-	 *  content_type= html | plain
-	 *  message= string
-	 *  sendcsv= bool
-	 *  use_ssl= bool
-	 *  use_tls= bool
-	 *  pdfview= string
-	 *  papersize= letter|legal|ledger|a2|a3|a4|a5
-	 *  paperorientation= portrait|landscape
-	 *  maxinputs= int
-	 *  maxtime= int m|s|h|d
-	 *  footer= string
-	 * 
- */ - @Override - public Node visitSendemailTransformation(DPLParser.SendemailTransformationContext ctx) { - Node rv = sendemailTransformationEmitCatalyst(ctx); - return rv; - } - - /** - * Sets the parameters based on the given command, and processes the email using the - * {@link SendemailResultsProcessor} - * @param ctx - * @return - */ - private Node sendemailTransformationEmitCatalyst(DPLParser.SendemailTransformationContext ctx) { - - // Parameters from given sendemail command - String toEmails = null, fromEmail = "teragrep@localhost.localdomain", ccEmails = null, bccEmails = null; - String subject = null, customMessageContent = null, customFooterContent = null; - boolean graceful = false; - int priority = 3; // 5-lowest 4-low 3-normal 2-high 1-highest - - boolean sendResults = false; // default false - boolean inline = false; // default false - String inlineFormat = "table"; // default "table", can be "raw" or "csv" - String content_type = "html"; // default "html", can be "plain" - boolean sendCsv = false, sendPdf = false; - String pdfView = null; - - String paperSize = "letter"; // for pdfs; default "letter", can be "legal", "ledger" or from "a2" to "a5" - String paperOrientation = "portrait"; // default "portrait", can be "landscape" - - String server = "localhost"; - int port = 25; - boolean use_tls = false, use_ssl = false; // default false - - boolean width_sort_columns = true; // only valid for contentType="plain" - - int maxInputs = 50000; // default 50k - String maxTime = ""; - - boolean restrictedMode = false; // issue #231 restricted mode - - boolean smtpDebug = false; // enable smtp debugging - - this.sendemailStep = new SendemailStep(); - - // Go through zeppelin config here, so parameters given in command can overwrite these if needed - // credentials for smtp server - String username = ""; - String password = ""; - - // get smtp username and password from zeppelin config - Config zplnConfig = catCtx.getConfig(); - - final String usernameCfgItem = "dpl.smtp.username"; - final String passwordCfgItem = "dpl.smtp.password"; - final String serverCfgItem = "dpl.smtp.server"; - final String restrictedModeCfgItem = "dpl.pth_10.transform.sendemail.restrictedMode"; // issue #231 restricted mode - final String globalFromParameterCfgItem = "dpl.pth_10.transform.sendemail.parameter.from"; // issue #232 global config for 'from' parameter - final String smtpDebugParameterCfgItem = "dpl.smtp.debug"; // config for smtp debug mode - final String smtpEncryptionParameterCfgItem = "dpl.smtp.encryption"; // config for smtp encryption mode (PLAIN, SSL or TLS) - - if (zplnConfig != null && zplnConfig.hasPath(usernameCfgItem)) { - username = zplnConfig.getString(usernameCfgItem); - LOGGER.debug("Sendemail config: username=<[{}]>", username); - } - - if (zplnConfig != null && zplnConfig.hasPath(passwordCfgItem)) { - password = zplnConfig.getString(passwordCfgItem); - LOGGER.debug("Sendemail config: password=<[{}]>", (password != null ? "***" : "null")); - } - - if (zplnConfig != null && zplnConfig.hasPath(smtpEncryptionParameterCfgItem)) { - String value = zplnConfig.getString(smtpEncryptionParameterCfgItem); - - if (value.equals("PLAIN")) { - use_ssl = false; - use_tls = false; - } - else if (value.equals("SSL")) { - use_ssl = true; - use_tls = false; - } - else if (value.equals("TLS")) { - use_ssl = false; - use_tls = true; - } - else { - throw new IllegalArgumentException("Invalid value for '" + smtpEncryptionParameterCfgItem + "'. 
It must be 'PLAIN', 'SSL' or 'TLS' instead of '" + value + "'"); - } - } - - // replace server given in server parameter if present in zeppelin config - if (zplnConfig != null && zplnConfig.hasPath(serverCfgItem)) { - String serverString = zplnConfig.getString(serverCfgItem); - - String[] hostAndPort = serverString.split(":"); - // more than one item, means port must be present - if (hostAndPort.length > 1) { - server = hostAndPort[0]; - port = Integer.parseInt(hostAndPort[1]); - LOGGER.debug("Sendemail config: server host=<[{}]> port=<[{}]>", server, port); - } - // One item (or less), just server - else { - server = hostAndPort[0]; - LOGGER.debug("Sendemail config: server host=<[{}]> (default port <{}>)",server,port); - } - } - - // issue #231 and #232 parameters from zeppelin config (restrictedMode and fromEmail) - if (zplnConfig != null && zplnConfig.hasPath(restrictedModeCfgItem)) { - restrictedMode = zplnConfig.getBoolean(restrictedModeCfgItem); - LOGGER.debug("Sendemail config: Restricted config=<[{}]>", restrictedMode); - } - - if (zplnConfig != null && zplnConfig.hasPath(globalFromParameterCfgItem)) { - fromEmail = zplnConfig.getString(globalFromParameterCfgItem); - LOGGER.debug("Sendemail config: Global from parameter=<[{}]>", fromEmail); - } - - if (zplnConfig != null && zplnConfig.hasPath(smtpDebugParameterCfgItem)) { - smtpDebug = zplnConfig.getBoolean(smtpDebugParameterCfgItem); - LOGGER.debug("Sendemail config: SMTP Debug parameter=<[{}]>", smtpDebug); - } - - boolean hasForbiddenConfig = false; // set to true if other parameters than email subject and to is found - - // Go through all the parameters given in the command using the for loop below. - for (int i = 0; i < ctx.getChildCount(); i++) { - ParseTree child = ctx.getChild(i); - - if (child instanceof TerminalNode) { - // COMMAND_MODE_SENDEMAIL - // skip keyword "sendemail" - continue; - } - else if (child instanceof DPLParser.T_sendemail_toOptionParameterContext) { - // t_sendemail_toOptionParameter - toEmails = ((StringNode) visit(child)).toString(); - } - else if (child instanceof DPLParser.T_sendemail_fromOptionParameterContext) { - // t_sendemail_fromOptionParameter - fromEmail = ((StringNode) visit(child)).toString(); - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_ccOptionParameterContext) { - // t_sendemail_ccOptionParameter - ccEmails = ((StringNode) visit(child)).toString(); - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_bccOptionParameterContext) { - // t_sendemail_bccOptionParameter - bccEmails = ((StringNode) visit(child)).toString(); - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_subjectOptionParameterContext) { - // t_sendemail_subjectOptionParameter - subject = ((StringNode) visit(child)).toString(); - } - else if (child instanceof DPLParser.T_sendemail_messageOptionParameterContext) { - // T_sendemail_messageOptionParameter - customMessageContent = ((StringNode) visit(child)).toString(); - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_footerOptionParameterContext) { - // T_sendemail_footerOptionParameter - customFooterContent = ((StringNode) visit(child)).toString(); - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_sendresultsOptionParameterContext) { - // T_sendemail_sendresultsOptionParameter - sendResults = ((StringNode) visit(child)).toString() == "true"; - hasForbiddenConfig = true; - } - else if (child instanceof 
DPLParser.T_sendemail_inlineOptionParameterContext) { - // t_sendemail_inlineOptionParameter - inline = ((StringNode) visit(child)).toString() == "true"; - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_formatOptionParameterContext) { - // t_sendemail_formatOptionParameter - inlineFormat = ((StringNode) visit(child)).toString(); - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_sendcsvOptionParameterContext) { - // t_sendemail_sendcsvOptionParameter - sendCsv = ((StringNode) visit(child)).toString() == "true"; - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_sendpdfOptionParameterContext) { - // t_sendemail_sendpdfOptionParameter - sendPdf = ((StringNode) visit(child)).toString() == "true"; - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_pdfviewOptionParameterContext) { - // t_sendemail_pdfviewOptionParameter - throw new UnsupportedOperationException("Sendemail does not support 'pdfview' parameter yet."); - } - else if (child instanceof DPLParser.T_sendemail_paperorientationOptionParameterContext) { - // t_sendemail_paperorientationOptionParameter - paperOrientation = ((StringNode) visit(child)).toString(); - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_papersizeOptionParameterContext) { - // t_sendemail_papersizeOptionParameter - paperSize = ((StringNode) visit(child)).toString(); - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_priorityOptionParameterContext) { - // t_sendemail_priorityOptionParameter - throw new UnsupportedOperationException("Sendemail does not support 'priority' parameter yet."); - } - else if (child instanceof DPLParser.T_sendemail_serverOptionParameterContext) { - // t_sendemail_serverOptionParameter - // split : - // if is missing, use default - String serverString = ((StringNode) visit(child)).toString(); - - LOGGER.debug("server string (should be host:port) = <[{}]>", serverString); - - String[] hostAndPort = serverString.split(":"); - // more than one item, means port must be present - if (hostAndPort.length > 1) { - server = hostAndPort[0]; - port = Integer.parseInt(hostAndPort[1]); - } - // One item (or less), just server - else { - server = hostAndPort[0]; - } - - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_gracefulParameterContext) { - // t_sendemail_gracefulParameter - graceful = ((StringNode) visit(child)).toString() == "true"; - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_contentTypeOptionParameterContext) { - // T_sendemail_contentTypeOptionParameter - content_type = ((StringNode) visit(child)).toString(); - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_widthSortColumnsOptionParameterContext) { - // T_sendemail_widthSortColumnsOptionParameter - throw new UnsupportedOperationException("Sendemail does not support 'width_sort_columns' parameter yet."); - //widthSortColumns = ((StringNode) visit(child)).toString() == "true"; - } - else if (child instanceof DPLParser.T_sendemail_useSslOptionParameterContext) { - // T_sendemail_useSslOptionParameter - use_ssl = ((StringNode) visit(child)).toString() == "true"; - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_useTlsOptionParameterContext) { - // T_sendemail_useTlsOptionParameter - use_tls = ((StringNode) visit(child)).toString() == "true"; - hasForbiddenConfig = true; - 
} - else if (child instanceof DPLParser.T_sendemail_maxinputsParameterContext) { - // T_sendemail_maxinputsParameter - maxInputs = Integer.parseInt(((StringNode) visit(child)).toString()); - hasForbiddenConfig = true; - } - else if (child instanceof DPLParser.T_sendemail_maxtimeParameterContext) { - // t_sendemail_maxtimeParameter - // m | s | h | d - - throw new UnsupportedOperationException("Sendemail does not support 'maxtime' parameter yet."); - /*String maxTimeString = ((StringNode) visit(child)).toString(); - LOGGER.info("max time string= {}", maxTimeString); - Pattern pattern = Pattern.compile("\\d+"); - Matcher matcher = pattern.matcher(maxTimeString); - if (matcher.find()) { - String number = matcher.group(); - int numberAsInt = Integer.parseInt(number); - String timeUnit = maxTimeString.substring(number.length()); - LOGGER.info("max time = " + numberAsInt + " of unit " + timeUnit); - // TODO do something with numberAsInt and timeUnit. - } - else { - throw new RuntimeException("maxtime argument contained an invalid time argument.\nExpected: m | s | h | d\nGot: " + maxTimeString); - }*/ - } - } - - if (restrictedMode && hasForbiddenConfig) { - throw new IllegalArgumentException("Forbidden configuration detected. Please make sure that only the 'to' and 'subject' parameters are used, or switch off restricted mode."); - } - - if (use_tls && use_ssl) { - throw new IllegalArgumentException("Both 'use_tls' and 'use_ssl' cannot be used simultaneously. Please enable either 'use_tls' or 'use_ssl', not both."); - } - - // initialize results processor - final SendemailResultsProcessor resultsProcessor = new SendemailResultsProcessor(use_tls, server, port, use_ssl, username, password, fromEmail, toEmails, ccEmails, bccEmails, subject, customMessageContent, - inlineFormat, sendResults, inline, sendCsv, sendPdf, customFooterContent, paperSize, paperOrientation, content_type, maxInputs, catCtx.getUrl(), smtpDebug); - - // step - this.sendemailStep.setSendemailResultsProcessor(resultsProcessor); - this.sendemailStep.setSendResults(sendResults); - - return new StepNode(sendemailStep); - } - - - - - // COMMAND_SENDEMAIL_MODE_TO t_sendemail_emailListParameter - @Override - public Node visitT_sendemail_toOptionParameter(DPLParser.T_sendemail_toOptionParameterContext ctx) { - Node rv = null; - - // skip keyword and return email list - rv = visit(ctx.getChild(1)); - - return rv; - } - - // COMMAND_SENDEMAIL_MODE_FROM ... 
- @Override - public Node visitT_sendemail_fromOptionParameter(DPLParser.T_sendemail_fromOptionParameterContext ctx) { - Node rv = null; - - // skip keyword and return email list - rv = visit(ctx.getChild(1)); - - return rv; - } - - @Override - public Node visitT_sendemail_ccOptionParameter(DPLParser.T_sendemail_ccOptionParameterContext ctx) { - Node rv = null; - - // skip keyword and return email list - rv = visit(ctx.getChild(1)); - - return rv; - } - - @Override - public Node visitT_sendemail_bccOptionParameter(DPLParser.T_sendemail_bccOptionParameterContext ctx) { - Node rv = null; - - // skip keyword and return email list - rv = visit(ctx.getChild(1)); - - return rv; - } - - @Override - public Node visitT_sendemail_subjectOptionParameter(DPLParser.T_sendemail_subjectOptionParameterContext ctx) { - Node rv = null; - - // skip keyword and return subject - rv = new StringNode(new Token(Token.Type.STRING, new UnquotedText(new TextString(ctx.getChild(1).getText())).read())); - - return rv; - } - - @Override - public Node visitT_sendemail_messageOptionParameter(DPLParser.T_sendemail_messageOptionParameterContext ctx) { - Node rv = null; - - // skip keyword and return message - rv = new StringNode(new Token(Token.Type.STRING, new UnquotedText(new TextString(ctx.getChild(1).getText())).read())); - - return rv; - } - - @Override - public Node visitT_sendemail_footerOptionParameter(DPLParser.T_sendemail_footerOptionParameterContext ctx) { - Node rv = null; - - // skip keyword and return footer - rv = new StringNode(new Token(Token.Type.STRING, new UnquotedText(new TextString(ctx.getChild(1).getText())).read())); - - return rv; - } - - @Override - public Node visitT_sendemail_inlineOptionParameter(DPLParser.T_sendemail_inlineOptionParameterContext ctx) { - Node rv = null; - - TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); - String value = null; - - switch (booleanValue.getSymbol().getType()) { - case DPLLexer.GET_BOOLEAN_TRUE: - value = "true"; - break; - case DPLLexer.GET_BOOLEAN_FALSE: - value = "false"; - break; - } - - rv = new StringNode(new Token(Token.Type.STRING, value)); - return rv; - } - - @Override - public Node visitT_sendemail_formatOptionParameter(DPLParser.T_sendemail_formatOptionParameterContext ctx) { - Node rv = null; - - TerminalNode formatValue = (TerminalNode) ctx.getChild(1);//.getChild(0); - String value = null; - - switch (formatValue.getSymbol().getType()) { - case DPLLexer.COMMAND_SENDEMAIL_MODE_FORMAT_MODE_CSV: - value = "csv"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_FORMAT_MODE_TABLE: - value = "table"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_FORMAT_MODE_RAW: - value = "raw"; - break; - } - - rv = new StringNode(new Token(Token.Type.STRING, value)); - - return rv; - } - - @Override - public Node visitT_sendemail_sendcsvOptionParameter(DPLParser.T_sendemail_sendcsvOptionParameterContext ctx) { - Node rv = null; - - TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); - String value = null; - - switch (booleanValue.getSymbol().getType()) { - case DPLLexer.GET_BOOLEAN_TRUE: - value = "true"; - break; - case DPLLexer.GET_BOOLEAN_FALSE: - value = "false"; - break; - } - - rv = new StringNode(new Token(Token.Type.STRING, value)); - - return rv; - } - - @Override - public Node visitT_sendemail_sendpdfOptionParameter(DPLParser.T_sendemail_sendpdfOptionParameterContext ctx) { - Node rv = null; - - TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); - String value = null; - - switch 
(booleanValue.getSymbol().getType()) { - case DPLLexer.GET_BOOLEAN_TRUE: - value = "true"; - break; - case DPLLexer.GET_BOOLEAN_FALSE: - value = "false"; - break; - } - - rv = new StringNode(new Token(Token.Type.STRING, value)); - - return rv; - } - - @Override - public Node visitT_sendemail_pdfviewOptionParameter(DPLParser.T_sendemail_pdfviewOptionParameterContext ctx) { - Node rv = null; - - rv = new StringNode(new Token(Token.Type.STRING, ctx.getChild(1).getText())); - - return rv; - } - - @Override - public Node visitT_sendemail_sendresultsOptionParameter(DPLParser.T_sendemail_sendresultsOptionParameterContext ctx) { - Node rv = null; - - TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); - String value = null; - - switch (booleanValue.getSymbol().getType()) { - case DPLLexer.GET_BOOLEAN_TRUE: - value = "true"; - break; - case DPLLexer.GET_BOOLEAN_FALSE: - value = "false"; - break; - } - - rv = new StringNode(new Token(Token.Type.STRING, value)); - return rv; - } - - @Override - public Node visitT_sendemail_paperorientationOptionParameter(DPLParser.T_sendemail_paperorientationOptionParameterContext ctx) { - Node rv = null; - - TerminalNode paperOrientationValue = (TerminalNode) ctx.getChild(1); - String value = null; - - switch (paperOrientationValue.getSymbol().getType()) { - case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERORIENTATION_MODE_PORTRAIT: - value = "portrait"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERORIENTATION_MODE_LANDSCAPE: - value = "landscape"; - break; - } - - rv = new StringNode(new Token(Token.Type.STRING, value)); - return rv; - } - - @Override - public Node visitT_sendemail_papersizeOptionParameter(DPLParser.T_sendemail_papersizeOptionParameterContext ctx) { - Node rv = null; - - TerminalNode paperSizeValue = (TerminalNode) ctx.getChild(1); - String value = null; - - switch (paperSizeValue.getSymbol().getType()) { - case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_A2: - value = "a2"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_A3: - value = "a3"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_A4: - value = "a4"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_A5: - value = "a5"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_LEDGER: - value = "ledger"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_LEGAL: - value = "legal"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_LETTER: - value = "letter"; - break; - } - - rv = new StringNode(new Token(Token.Type.STRING, value)); - return rv; - } - - @Override - public Node visitT_sendemail_priorityOptionParameter(DPLParser.T_sendemail_priorityOptionParameterContext ctx) { - Node rv = null; - - TerminalNode priorityValue = (TerminalNode) ctx.getChild(1); - String value = ""; - - switch (priorityValue.getSymbol().getType()) { - case DPLLexer.COMMAND_SENDEMAIL_MODE_PRIORITY_MODE_LOWEST: - value = "5"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_PRIORITY_MODE_LOW: - value = "4"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_PRIORITY_MODE_NORMAL: - value = "3"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_PRIORITY_MODE_HIGH: - value = "2"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_PRIORITY_MODE_HIGHEST: - value = "1"; - break; - } - - rv = new StringNode(new Token(Token.Type.STRING, value)); - return rv; - } - - @Override - public Node visitT_sendemail_serverOptionParameter(DPLParser.T_sendemail_serverOptionParameterContext ctx) { - Node rv = null; - - String server = 
ctx.getChild(1).getText(); - - rv = new StringNode(new Token(Token.Type.STRING, server)); - return rv; - } - - @Override - public Node visitT_sendemail_gracefulParameter(DPLParser.T_sendemail_gracefulParameterContext ctx) { - Node rv = null; - - TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); - String value = null; - - switch (booleanValue.getSymbol().getType()) { - case DPLLexer.GET_BOOLEAN_TRUE: - value = "true"; - break; - case DPLLexer.GET_BOOLEAN_FALSE: - value = "false"; - break; - } - - rv = new StringNode(new Token(Token.Type.STRING, value)); - return rv; - } - - @Override - public Node visitT_sendemail_contentTypeOptionParameter(DPLParser.T_sendemail_contentTypeOptionParameterContext ctx) { - Node rv = null; - - // content_type is html OR plain - - TerminalNode contentTypeValue = (TerminalNode) ctx.getChild(1);//.getChild(0); - String value = null; - - switch (contentTypeValue.getSymbol().getType()) { - case DPLLexer.COMMAND_SENDEMAIL_MODE_CONTENT_TYPE_MODE_HTML: - value = "html"; - break; - case DPLLexer.COMMAND_SENDEMAIL_MODE_CONTENT_TYPE_MODE_PLAIN: - value = "plain"; - break; - } - - rv = new StringNode(new Token(Token.Type.STRING, value)); - - return rv; - } - - @Override - public Node visitT_sendemail_widthSortColumnsOptionParameter(DPLParser.T_sendemail_widthSortColumnsOptionParameterContext ctx) { - Node rv = null; - - TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); - String value = null; - - switch (booleanValue.getSymbol().getType()) { - case DPLLexer.GET_BOOLEAN_TRUE: - value = "true"; - break; - case DPLLexer.GET_BOOLEAN_FALSE: - value = "false"; - break; - } - - rv = new StringNode(new Token(Token.Type.STRING, value)); - - return rv; - } - - @Override - public Node visitT_sendemail_useSslOptionParameter(DPLParser.T_sendemail_useSslOptionParameterContext ctx) { - Node rv = null; - - TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); - String value = null; - - switch (booleanValue.getSymbol().getType()) { - case DPLLexer.GET_BOOLEAN_TRUE: - value = "true"; - break; - case DPLLexer.GET_BOOLEAN_FALSE: - value = "false"; - break; - } - - rv = new StringNode(new Token(Token.Type.STRING, value)); - - - return rv; - } - - @Override - public Node visitT_sendemail_useTlsOptionParameter(DPLParser.T_sendemail_useTlsOptionParameterContext ctx) { - Node rv = null; - - TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); - String value = null; - - switch (booleanValue.getSymbol().getType()) { - case DPLLexer.GET_BOOLEAN_TRUE: - value = "true"; - break; - case DPLLexer.GET_BOOLEAN_FALSE: - value = "false"; - break; - } - - rv = new StringNode(new Token(Token.Type.STRING, value)); - - return rv; - } - - @Override - public Node visitT_sendemail_maxinputsParameter(DPLParser.T_sendemail_maxinputsParameterContext ctx) { - Node rv = null; - - rv = new StringNode(new Token(Token.Type.STRING, ctx.getChild(1).getText())); - - return rv; - } - - @Override - public Node visitT_sendemail_maxtimeParameter(DPLParser.T_sendemail_maxtimeParameterContext ctx) { - Node rv = null; - - rv = new StringNode(new Token(Token.Type.STRING, ctx.getChild(1).getText())); - - return rv; - } - - // stringType (COMMA stringType)*? 
- @Override - public Node visitT_sendemail_emailListParameter(DPLParser.T_sendemail_emailListParameterContext ctx) { - return new StringNode(new Token(Token.Type.STRING, new UnquotedText(new TextString(ctx.getText())).read())); - } - + + private static final Logger LOGGER = LoggerFactory.getLogger(SendemailTransformation.class); + private DPLParserCatalystContext catCtx = null; + public SendemailStep sendemailStep = null; + + public SendemailTransformation(DPLParserCatalystContext catCtx) { + this.catCtx = catCtx; + } + + /** + *
+     * Command info:
+     * Generates email notifications. Email search results to specified email addresses.
+     * SMTP server needs to be available to send email.
+     * 
+     * Syntax:
+     * sendemail
+     * 	to= list of emails [REQUIRED, other params optional]
+     *  cc= list of emails
+     *  bcc= list of emails
+     *  subject= string
+     *  format= csv|table|raw
+     *  inline= bool
+     *  sendresults= bool
+     *  sendpdf= bool
+     *  priority= highest|high|normal|low|lowest
+     *  server= string
+     *  width_sort_columns= bool
+     *  graceful= bool
+     *  content_type= html | plain
+     *  message= string
+     *  sendcsv= bool
+     *  use_ssl= bool
+     *  use_tls= bool
+     *  pdfview= string
+     *  papersize= letter|legal|ledger|a2|a3|a4|a5
+     *  paperorientation= portrait|landscape
+     *  maxinputs= int
+     *  maxtime= int m|s|h|d
+     *  footer= string
+     * 
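In the code that follows, both the dpl.smtp.server config item and the server= parameter are split on ':' and fall back to port 25 when no port is given. A minimal sketch of that parsing, purely illustrative (SmtpTarget and the example hostname are hypothetical, not part of this patch):

    final class SmtpTarget {
        final String host;
        final int port;

        private SmtpTarget(String host, int port) {
            this.host = host;
            this.port = port;
        }

        // "mail.example.com:2525" -> explicit port; "mail.example.com" -> default port 25
        static SmtpTarget parse(String serverString) {
            String[] hostAndPort = serverString.split(":");
            if (hostAndPort.length > 1) {
                return new SmtpTarget(hostAndPort[0], Integer.parseInt(hostAndPort[1]));
            }
            return new SmtpTarget(hostAndPort[0], 25);
        }
    }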
+ */ + @Override + public Node visitSendemailTransformation(DPLParser.SendemailTransformationContext ctx) { + Node rv = sendemailTransformationEmitCatalyst(ctx); + return rv; + } + + /** + * Sets the parameters based on the given command, and processes the email using the + * {@link SendemailResultsProcessor} + * + * @param ctx + * @return + */ + private Node sendemailTransformationEmitCatalyst(DPLParser.SendemailTransformationContext ctx) { + + // Parameters from given sendemail command + String toEmails = null, fromEmail = "teragrep@localhost.localdomain", ccEmails = null, bccEmails = null; + String subject = null, customMessageContent = null, customFooterContent = null; + boolean graceful = false; + int priority = 3; // 5-lowest 4-low 3-normal 2-high 1-highest + + boolean sendResults = false; // default false + boolean inline = false; // default false + String inlineFormat = "table"; // default "table", can be "raw" or "csv" + String content_type = "html"; // default "html", can be "plain" + boolean sendCsv = false, sendPdf = false; + String pdfView = null; + + String paperSize = "letter"; // for pdfs; default "letter", can be "legal", "ledger" or from "a2" to "a5" + String paperOrientation = "portrait"; // default "portrait", can be "landscape" + + String server = "localhost"; + int port = 25; + boolean use_tls = false, use_ssl = false; // default false + + boolean width_sort_columns = true; // only valid for contentType="plain" + + int maxInputs = 50000; // default 50k + String maxTime = ""; + + boolean restrictedMode = false; // issue #231 restricted mode + + boolean smtpDebug = false; // enable smtp debugging + + this.sendemailStep = new SendemailStep(); + + // Go through zeppelin config here, so parameters given in command can overwrite these if needed + // credentials for smtp server + String username = ""; + String password = ""; + + // get smtp username and password from zeppelin config + Config zplnConfig = catCtx.getConfig(); + + final String usernameCfgItem = "dpl.smtp.username"; + final String passwordCfgItem = "dpl.smtp.password"; + final String serverCfgItem = "dpl.smtp.server"; + final String restrictedModeCfgItem = "dpl.pth_10.transform.sendemail.restrictedMode"; // issue #231 restricted mode + final String globalFromParameterCfgItem = "dpl.pth_10.transform.sendemail.parameter.from"; // issue #232 global config for 'from' parameter + final String smtpDebugParameterCfgItem = "dpl.smtp.debug"; // config for smtp debug mode + final String smtpEncryptionParameterCfgItem = "dpl.smtp.encryption"; // config for smtp encryption mode (PLAIN, SSL or TLS) + + if (zplnConfig != null && zplnConfig.hasPath(usernameCfgItem)) { + username = zplnConfig.getString(usernameCfgItem); + LOGGER.debug("Sendemail config: username=<[{}]>", username); + } + + if (zplnConfig != null && zplnConfig.hasPath(passwordCfgItem)) { + password = zplnConfig.getString(passwordCfgItem); + LOGGER.debug("Sendemail config: password=<[{}]>", (password != null ? "***" : "null")); + } + + if (zplnConfig != null && zplnConfig.hasPath(smtpEncryptionParameterCfgItem)) { + String value = zplnConfig.getString(smtpEncryptionParameterCfgItem); + + if (value.equals("PLAIN")) { + use_ssl = false; + use_tls = false; + } + else if (value.equals("SSL")) { + use_ssl = true; + use_tls = false; + } + else if (value.equals("TLS")) { + use_ssl = false; + use_tls = true; + } + else { + throw new IllegalArgumentException( + "Invalid value for '" + smtpEncryptionParameterCfgItem + + "'. 
It must be 'PLAIN', 'SSL' or 'TLS' instead of '" + value + "'" + ); + } + } + + // replace server given in server parameter if present in zeppelin config + if (zplnConfig != null && zplnConfig.hasPath(serverCfgItem)) { + String serverString = zplnConfig.getString(serverCfgItem); + + String[] hostAndPort = serverString.split(":"); + // more than one item, means port must be present + if (hostAndPort.length > 1) { + server = hostAndPort[0]; + port = Integer.parseInt(hostAndPort[1]); + LOGGER.debug("Sendemail config: server host=<[{}]> port=<[{}]>", server, port); + } + // One item (or less), just server + else { + server = hostAndPort[0]; + LOGGER.debug("Sendemail config: server host=<[{}]> (default port <{}>)", server, port); + } + } + + // issue #231 and #232 parameters from zeppelin config (restrictedMode and fromEmail) + if (zplnConfig != null && zplnConfig.hasPath(restrictedModeCfgItem)) { + restrictedMode = zplnConfig.getBoolean(restrictedModeCfgItem); + LOGGER.debug("Sendemail config: Restricted config=<[{}]>", restrictedMode); + } + + if (zplnConfig != null && zplnConfig.hasPath(globalFromParameterCfgItem)) { + fromEmail = zplnConfig.getString(globalFromParameterCfgItem); + LOGGER.debug("Sendemail config: Global from parameter=<[{}]>", fromEmail); + } + + if (zplnConfig != null && zplnConfig.hasPath(smtpDebugParameterCfgItem)) { + smtpDebug = zplnConfig.getBoolean(smtpDebugParameterCfgItem); + LOGGER.debug("Sendemail config: SMTP Debug parameter=<[{}]>", smtpDebug); + } + + boolean hasForbiddenConfig = false; // set to true if other parameters than email subject and to is found + + // Go through all the parameters given in the command using the for loop below. + for (int i = 0; i < ctx.getChildCount(); i++) { + ParseTree child = ctx.getChild(i); + + if (child instanceof TerminalNode) { + // COMMAND_MODE_SENDEMAIL + // skip keyword "sendemail" + continue; + } + else if (child instanceof DPLParser.T_sendemail_toOptionParameterContext) { + // t_sendemail_toOptionParameter + toEmails = ((StringNode) visit(child)).toString(); + } + else if (child instanceof DPLParser.T_sendemail_fromOptionParameterContext) { + // t_sendemail_fromOptionParameter + fromEmail = ((StringNode) visit(child)).toString(); + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_ccOptionParameterContext) { + // t_sendemail_ccOptionParameter + ccEmails = ((StringNode) visit(child)).toString(); + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_bccOptionParameterContext) { + // t_sendemail_bccOptionParameter + bccEmails = ((StringNode) visit(child)).toString(); + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_subjectOptionParameterContext) { + // t_sendemail_subjectOptionParameter + subject = ((StringNode) visit(child)).toString(); + } + else if (child instanceof DPLParser.T_sendemail_messageOptionParameterContext) { + // T_sendemail_messageOptionParameter + customMessageContent = ((StringNode) visit(child)).toString(); + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_footerOptionParameterContext) { + // T_sendemail_footerOptionParameter + customFooterContent = ((StringNode) visit(child)).toString(); + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_sendresultsOptionParameterContext) { + // T_sendemail_sendresultsOptionParameter + sendResults = ((StringNode) visit(child)).toString() == "true"; + hasForbiddenConfig = true; + } + else if (child instanceof 
DPLParser.T_sendemail_inlineOptionParameterContext) { + // t_sendemail_inlineOptionParameter + inline = ((StringNode) visit(child)).toString() == "true"; + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_formatOptionParameterContext) { + // t_sendemail_formatOptionParameter + inlineFormat = ((StringNode) visit(child)).toString(); + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_sendcsvOptionParameterContext) { + // t_sendemail_sendcsvOptionParameter + sendCsv = ((StringNode) visit(child)).toString() == "true"; + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_sendpdfOptionParameterContext) { + // t_sendemail_sendpdfOptionParameter + sendPdf = ((StringNode) visit(child)).toString() == "true"; + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_pdfviewOptionParameterContext) { + // t_sendemail_pdfviewOptionParameter + throw new UnsupportedOperationException("Sendemail does not support 'pdfview' parameter yet."); + } + else if (child instanceof DPLParser.T_sendemail_paperorientationOptionParameterContext) { + // t_sendemail_paperorientationOptionParameter + paperOrientation = ((StringNode) visit(child)).toString(); + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_papersizeOptionParameterContext) { + // t_sendemail_papersizeOptionParameter + paperSize = ((StringNode) visit(child)).toString(); + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_priorityOptionParameterContext) { + // t_sendemail_priorityOptionParameter + throw new UnsupportedOperationException("Sendemail does not support 'priority' parameter yet."); + } + else if (child instanceof DPLParser.T_sendemail_serverOptionParameterContext) { + // t_sendemail_serverOptionParameter + // split : + // if is missing, use default + String serverString = ((StringNode) visit(child)).toString(); + + LOGGER.debug("server string (should be host:port) = <[{}]>", serverString); + + String[] hostAndPort = serverString.split(":"); + // more than one item, means port must be present + if (hostAndPort.length > 1) { + server = hostAndPort[0]; + port = Integer.parseInt(hostAndPort[1]); + } + // One item (or less), just server + else { + server = hostAndPort[0]; + } + + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_gracefulParameterContext) { + // t_sendemail_gracefulParameter + graceful = ((StringNode) visit(child)).toString() == "true"; + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_contentTypeOptionParameterContext) { + // T_sendemail_contentTypeOptionParameter + content_type = ((StringNode) visit(child)).toString(); + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_widthSortColumnsOptionParameterContext) { + // T_sendemail_widthSortColumnsOptionParameter + throw new UnsupportedOperationException( + "Sendemail does not support 'width_sort_columns' parameter yet." 
+ ); + //widthSortColumns = ((StringNode) visit(child)).toString() == "true"; + } + else if (child instanceof DPLParser.T_sendemail_useSslOptionParameterContext) { + // T_sendemail_useSslOptionParameter + use_ssl = ((StringNode) visit(child)).toString() == "true"; + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_useTlsOptionParameterContext) { + // T_sendemail_useTlsOptionParameter + use_tls = ((StringNode) visit(child)).toString() == "true"; + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_maxinputsParameterContext) { + // T_sendemail_maxinputsParameter + maxInputs = Integer.parseInt(((StringNode) visit(child)).toString()); + hasForbiddenConfig = true; + } + else if (child instanceof DPLParser.T_sendemail_maxtimeParameterContext) { + // t_sendemail_maxtimeParameter + // m | s | h | d + + throw new UnsupportedOperationException("Sendemail does not support 'maxtime' parameter yet."); + /*String maxTimeString = ((StringNode) visit(child)).toString(); + LOGGER.info("max time string= {}", maxTimeString); + Pattern pattern = Pattern.compile("\\d+"); + Matcher matcher = pattern.matcher(maxTimeString); + if (matcher.find()) { + String number = matcher.group(); + int numberAsInt = Integer.parseInt(number); + String timeUnit = maxTimeString.substring(number.length()); + LOGGER.info("max time = " + numberAsInt + " of unit " + timeUnit); + // TODO do something with numberAsInt and timeUnit. + } + else { + throw new RuntimeException("maxtime argument contained an invalid time argument.\nExpected: m | s | h | d\nGot: " + maxTimeString); + }*/ + } + } + + if (restrictedMode && hasForbiddenConfig) { + throw new IllegalArgumentException( + "Forbidden configuration detected. Please make sure that only the 'to' and 'subject' parameters are used, or switch off restricted mode." + ); + } + + if (use_tls && use_ssl) { + throw new IllegalArgumentException( + "Both 'use_tls' and 'use_ssl' cannot be used simultaneously. Please enable either 'use_tls' or 'use_ssl', not both." + ); + } + + // initialize results processor + final SendemailResultsProcessor resultsProcessor = new SendemailResultsProcessor( + use_tls, + server, + port, + use_ssl, + username, + password, + fromEmail, + toEmails, + ccEmails, + bccEmails, + subject, + customMessageContent, + inlineFormat, + sendResults, + inline, + sendCsv, + sendPdf, + customFooterContent, + paperSize, + paperOrientation, + content_type, + maxInputs, + catCtx.getUrl(), + smtpDebug + ); + + // step + this.sendemailStep.setSendemailResultsProcessor(resultsProcessor); + this.sendemailStep.setSendResults(sendResults); + + return new StepNode(sendemailStep); + } + + // COMMAND_SENDEMAIL_MODE_TO t_sendemail_emailListParameter + @Override + public Node visitT_sendemail_toOptionParameter(DPLParser.T_sendemail_toOptionParameterContext ctx) { + Node rv = null; + + // skip keyword and return email list + rv = visit(ctx.getChild(1)); + + return rv; + } + + // COMMAND_SENDEMAIL_MODE_FROM ... 
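For context on the configuration lookups at the top of sendemailTransformationEmitCatalyst: below is a minimal, self-contained sketch of how the dpl.smtp.* and dpl.pth_10.transform.sendemail.* keys read there could be supplied through a Typesafe Config object such as the one returned by catCtx.getConfig(). The key names come from the code above; the literal values and the SendemailConfigSketch class are illustrative assumptions only.

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

// Hypothetical demo class; the key names match the cfgItem constants used by the visitor.
public final class SendemailConfigSketch {
    public static void main(String[] args) {
        Config cfg = ConfigFactory.parseString(
                "dpl.smtp.username=\"mailer\"\n"
              + "dpl.smtp.password=\"secret\"\n"
              + "dpl.smtp.server=\"smtp.example.test:587\"\n"
              + "dpl.smtp.encryption=TLS\n"
              + "dpl.smtp.debug=false\n"
              + "dpl.pth_10.transform.sendemail.restrictedMode=true\n"
              + "dpl.pth_10.transform.sendemail.parameter.from=\"noreply@example.test\""
        );
        // Same hasPath/getX pattern the visitor uses before command parameters can override the defaults.
        String server = cfg.hasPath("dpl.smtp.server") ? cfg.getString("dpl.smtp.server") : "localhost";
        boolean restricted = cfg.hasPath("dpl.pth_10.transform.sendemail.restrictedMode")
                && cfg.getBoolean("dpl.pth_10.transform.sendemail.restrictedMode");
        System.out.println("server=" + server + " restricted=" + restricted);
    }
}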
+ @Override + public Node visitT_sendemail_fromOptionParameter(DPLParser.T_sendemail_fromOptionParameterContext ctx) { + Node rv = null; + + // skip keyword and return email list + rv = visit(ctx.getChild(1)); + + return rv; + } + + @Override + public Node visitT_sendemail_ccOptionParameter(DPLParser.T_sendemail_ccOptionParameterContext ctx) { + Node rv = null; + + // skip keyword and return email list + rv = visit(ctx.getChild(1)); + + return rv; + } + + @Override + public Node visitT_sendemail_bccOptionParameter(DPLParser.T_sendemail_bccOptionParameterContext ctx) { + Node rv = null; + + // skip keyword and return email list + rv = visit(ctx.getChild(1)); + + return rv; + } + + @Override + public Node visitT_sendemail_subjectOptionParameter(DPLParser.T_sendemail_subjectOptionParameterContext ctx) { + Node rv = null; + + // skip keyword and return subject + rv = new StringNode( + new Token(Token.Type.STRING, new UnquotedText(new TextString(ctx.getChild(1).getText())).read()) + ); + + return rv; + } + + @Override + public Node visitT_sendemail_messageOptionParameter(DPLParser.T_sendemail_messageOptionParameterContext ctx) { + Node rv = null; + + // skip keyword and return message + rv = new StringNode( + new Token(Token.Type.STRING, new UnquotedText(new TextString(ctx.getChild(1).getText())).read()) + ); + + return rv; + } + + @Override + public Node visitT_sendemail_footerOptionParameter(DPLParser.T_sendemail_footerOptionParameterContext ctx) { + Node rv = null; + + // skip keyword and return footer + rv = new StringNode( + new Token(Token.Type.STRING, new UnquotedText(new TextString(ctx.getChild(1).getText())).read()) + ); + + return rv; + } + + @Override + public Node visitT_sendemail_inlineOptionParameter(DPLParser.T_sendemail_inlineOptionParameterContext ctx) { + Node rv = null; + + TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); + String value = null; + + switch (booleanValue.getSymbol().getType()) { + case DPLLexer.GET_BOOLEAN_TRUE: + value = "true"; + break; + case DPLLexer.GET_BOOLEAN_FALSE: + value = "false"; + break; + } + + rv = new StringNode(new Token(Token.Type.STRING, value)); + return rv; + } + + @Override + public Node visitT_sendemail_formatOptionParameter(DPLParser.T_sendemail_formatOptionParameterContext ctx) { + Node rv = null; + + TerminalNode formatValue = (TerminalNode) ctx.getChild(1);//.getChild(0); + String value = null; + + switch (formatValue.getSymbol().getType()) { + case DPLLexer.COMMAND_SENDEMAIL_MODE_FORMAT_MODE_CSV: + value = "csv"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_FORMAT_MODE_TABLE: + value = "table"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_FORMAT_MODE_RAW: + value = "raw"; + break; + } + + rv = new StringNode(new Token(Token.Type.STRING, value)); + + return rv; + } + + @Override + public Node visitT_sendemail_sendcsvOptionParameter(DPLParser.T_sendemail_sendcsvOptionParameterContext ctx) { + Node rv = null; + + TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); + String value = null; + + switch (booleanValue.getSymbol().getType()) { + case DPLLexer.GET_BOOLEAN_TRUE: + value = "true"; + break; + case DPLLexer.GET_BOOLEAN_FALSE: + value = "false"; + break; + } + + rv = new StringNode(new Token(Token.Type.STRING, value)); + + return rv; + } + + @Override + public Node visitT_sendemail_sendpdfOptionParameter(DPLParser.T_sendemail_sendpdfOptionParameterContext ctx) { + Node rv = null; + + TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); + String value = null; + + 
switch (booleanValue.getSymbol().getType()) { + case DPLLexer.GET_BOOLEAN_TRUE: + value = "true"; + break; + case DPLLexer.GET_BOOLEAN_FALSE: + value = "false"; + break; + } + + rv = new StringNode(new Token(Token.Type.STRING, value)); + + return rv; + } + + @Override + public Node visitT_sendemail_pdfviewOptionParameter(DPLParser.T_sendemail_pdfviewOptionParameterContext ctx) { + Node rv = null; + + rv = new StringNode(new Token(Token.Type.STRING, ctx.getChild(1).getText())); + + return rv; + } + + @Override + public Node visitT_sendemail_sendresultsOptionParameter( + DPLParser.T_sendemail_sendresultsOptionParameterContext ctx + ) { + Node rv = null; + + TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); + String value = null; + + switch (booleanValue.getSymbol().getType()) { + case DPLLexer.GET_BOOLEAN_TRUE: + value = "true"; + break; + case DPLLexer.GET_BOOLEAN_FALSE: + value = "false"; + break; + } + + rv = new StringNode(new Token(Token.Type.STRING, value)); + return rv; + } + + @Override + public Node visitT_sendemail_paperorientationOptionParameter( + DPLParser.T_sendemail_paperorientationOptionParameterContext ctx + ) { + Node rv = null; + + TerminalNode paperOrientationValue = (TerminalNode) ctx.getChild(1); + String value = null; + + switch (paperOrientationValue.getSymbol().getType()) { + case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERORIENTATION_MODE_PORTRAIT: + value = "portrait"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERORIENTATION_MODE_LANDSCAPE: + value = "landscape"; + break; + } + + rv = new StringNode(new Token(Token.Type.STRING, value)); + return rv; + } + + @Override + public Node visitT_sendemail_papersizeOptionParameter(DPLParser.T_sendemail_papersizeOptionParameterContext ctx) { + Node rv = null; + + TerminalNode paperSizeValue = (TerminalNode) ctx.getChild(1); + String value = null; + + switch (paperSizeValue.getSymbol().getType()) { + case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_A2: + value = "a2"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_A3: + value = "a3"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_A4: + value = "a4"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_A5: + value = "a5"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_LEDGER: + value = "ledger"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_LEGAL: + value = "legal"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_PAPERSIZE_MODE_LETTER: + value = "letter"; + break; + } + + rv = new StringNode(new Token(Token.Type.STRING, value)); + return rv; + } + + @Override + public Node visitT_sendemail_priorityOptionParameter(DPLParser.T_sendemail_priorityOptionParameterContext ctx) { + Node rv = null; + + TerminalNode priorityValue = (TerminalNode) ctx.getChild(1); + String value = ""; + + switch (priorityValue.getSymbol().getType()) { + case DPLLexer.COMMAND_SENDEMAIL_MODE_PRIORITY_MODE_LOWEST: + value = "5"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_PRIORITY_MODE_LOW: + value = "4"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_PRIORITY_MODE_NORMAL: + value = "3"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_PRIORITY_MODE_HIGH: + value = "2"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_PRIORITY_MODE_HIGHEST: + value = "1"; + break; + } + + rv = new StringNode(new Token(Token.Type.STRING, value)); + return rv; + } + + @Override + public Node visitT_sendemail_serverOptionParameter(DPLParser.T_sendemail_serverOptionParameterContext ctx) { + Node rv = null; + + String server 
= ctx.getChild(1).getText(); + + rv = new StringNode(new Token(Token.Type.STRING, server)); + return rv; + } + + @Override + public Node visitT_sendemail_gracefulParameter(DPLParser.T_sendemail_gracefulParameterContext ctx) { + Node rv = null; + + TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); + String value = null; + + switch (booleanValue.getSymbol().getType()) { + case DPLLexer.GET_BOOLEAN_TRUE: + value = "true"; + break; + case DPLLexer.GET_BOOLEAN_FALSE: + value = "false"; + break; + } + + rv = new StringNode(new Token(Token.Type.STRING, value)); + return rv; + } + + @Override + public Node visitT_sendemail_contentTypeOptionParameter( + DPLParser.T_sendemail_contentTypeOptionParameterContext ctx + ) { + Node rv = null; + + // content_type is html OR plain + + TerminalNode contentTypeValue = (TerminalNode) ctx.getChild(1);//.getChild(0); + String value = null; + + switch (contentTypeValue.getSymbol().getType()) { + case DPLLexer.COMMAND_SENDEMAIL_MODE_CONTENT_TYPE_MODE_HTML: + value = "html"; + break; + case DPLLexer.COMMAND_SENDEMAIL_MODE_CONTENT_TYPE_MODE_PLAIN: + value = "plain"; + break; + } + + rv = new StringNode(new Token(Token.Type.STRING, value)); + + return rv; + } + + @Override + public Node visitT_sendemail_widthSortColumnsOptionParameter( + DPLParser.T_sendemail_widthSortColumnsOptionParameterContext ctx + ) { + Node rv = null; + + TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); + String value = null; + + switch (booleanValue.getSymbol().getType()) { + case DPLLexer.GET_BOOLEAN_TRUE: + value = "true"; + break; + case DPLLexer.GET_BOOLEAN_FALSE: + value = "false"; + break; + } + + rv = new StringNode(new Token(Token.Type.STRING, value)); + + return rv; + } + + @Override + public Node visitT_sendemail_useSslOptionParameter(DPLParser.T_sendemail_useSslOptionParameterContext ctx) { + Node rv = null; + + TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); + String value = null; + + switch (booleanValue.getSymbol().getType()) { + case DPLLexer.GET_BOOLEAN_TRUE: + value = "true"; + break; + case DPLLexer.GET_BOOLEAN_FALSE: + value = "false"; + break; + } + + rv = new StringNode(new Token(Token.Type.STRING, value)); + + return rv; + } + + @Override + public Node visitT_sendemail_useTlsOptionParameter(DPLParser.T_sendemail_useTlsOptionParameterContext ctx) { + Node rv = null; + + TerminalNode booleanValue = (TerminalNode) ctx.getChild(1).getChild(0); + String value = null; + + switch (booleanValue.getSymbol().getType()) { + case DPLLexer.GET_BOOLEAN_TRUE: + value = "true"; + break; + case DPLLexer.GET_BOOLEAN_FALSE: + value = "false"; + break; + } + + rv = new StringNode(new Token(Token.Type.STRING, value)); + + return rv; + } + + @Override + public Node visitT_sendemail_maxinputsParameter(DPLParser.T_sendemail_maxinputsParameterContext ctx) { + Node rv = null; + + rv = new StringNode(new Token(Token.Type.STRING, ctx.getChild(1).getText())); + + return rv; + } + + @Override + public Node visitT_sendemail_maxtimeParameter(DPLParser.T_sendemail_maxtimeParameterContext ctx) { + Node rv = null; + + rv = new StringNode(new Token(Token.Type.STRING, ctx.getChild(1).getText())); + + return rv; + } + + // stringType (COMMA stringType)*? 
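The server option above, like the dpl.smtp.server configuration item, accepts either "host" or "host:port" and falls back to port 25 when no port is given. A minimal stand-alone sketch of that split, assuming the same String.split(":") approach as the visitor; the ServerStringSketch name and the example host are invented for illustration.

import java.util.Arrays;

// Hypothetical demo of the "host[:port]" handling used for the server parameter.
public final class ServerStringSketch {
    public static void main(String[] args) {
        System.out.println(Arrays.toString(parse("smtp.example.test:587"))); // [smtp.example.test, 587]
        System.out.println(Arrays.toString(parse("smtp.example.test")));     // [smtp.example.test, 25]
    }

    static Object[] parse(String serverString) {
        String server;
        int port = 25; // default SMTP port, as in the visitor
        String[] hostAndPort = serverString.split(":");
        server = hostAndPort[0];
        if (hostAndPort.length > 1) { // more than one item means a port is present
            port = Integer.parseInt(hostAndPort[1]);
        }
        return new Object[] { server, port };
    }
}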
+ @Override + public Node visitT_sendemail_emailListParameter(DPLParser.T_sendemail_emailListParameterContext ctx) { + return new StringNode(new Token(Token.Type.STRING, new UnquotedText(new TextString(ctx.getText())).read())); + } + } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SortTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SortTransformation.java index d1d0571..15936f3 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SortTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SortTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.functions.dpf_02.SortByClause; @@ -70,6 +69,7 @@ * Processes the arguments and provides it for dpf_02 (BatchCollect) for sorting purposes. */ public class SortTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(SortTransformation.class); private final DPLParserCatalystContext catCtx; private final DPLParserCatalystVisitor catVisitor; @@ -89,9 +89,9 @@ public SortTransformation(DPLParserCatalystContext catCtx, DPLParserCatalystVisi } /** - * Sets the variable values based on the parameters given in the command, - * and builds the SortByClauses to be used by the dpf_02 BatchCollect, - * as the actual sorting happens in BatchCollect + * Sets the variable values based on the parameters given in the command, and builds the SortByClauses to be used by + * the dpf_02 BatchCollect, as the actual sorting happens in BatchCollect + * * @param ctx SortTransformationContext * @return CatalystNode containing result set (same as input) */ @@ -150,14 +150,23 @@ public Node visitSortTransformation(DPLParser.SortTransformationContext ctx) { // at least one sortByClause needs to be present // otherwise throw exception with details if (this.listOfSortByClauses.size() < 1) { - throw new IllegalArgumentException("Sort command should contain at least one sortByInstruction. Example: 'sort -_time'"); + throw new IllegalArgumentException( + "Sort command should contain at least one sortByInstruction. 
Example: 'sort -_time'" + ); } this.sortStep = new SortStep(catCtx, listOfSortByClauses, limit, desc); - LOGGER.info(String.format("Set sortStep params to: sbc=%s, desc=%s, bc=%s, limit=%s", - Arrays.toString(this.sortStep.getListOfSortByClauses().toArray()), this.sortStep.isDesc(), this.sortStep.getSortingBatchCollect(), - this.sortStep.getLimit())); + LOGGER + .info( + String + .format( + "Set sortStep params to: sbc=%s, desc=%s, bc=%s, limit=%s", Arrays + .toString(this.sortStep.getListOfSortByClauses().toArray()), + this.sortStep.isDesc(), this.sortStep.getSortingBatchCollect(), + this.sortStep.getLimit() + ) + ); return sortTransformationEmitCatalyst(ctx); } @@ -191,7 +200,6 @@ public Node visitT_sort_sortByClauseInstruction(DPLParser.T_sort_sortByClauseIns } } - // PLUS (+) is descending=false // MINUS (-) is descending=true @Override @@ -291,8 +299,8 @@ public Node visitT_sort_byMethodNum(DPLParser.T_sort_byMethodNumContext ctx) { } /** - * Pushes the list of sortBy clauses to ProcessingStack, - * and pushes the dataset to the stack. + * Pushes the list of sortBy clauses to ProcessingStack, and pushes the dataset to the stack. + * * @param ctx * @return */ diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SpathTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SpathTransformation.java index 055c41f..3f8f473 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SpathTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/SpathTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -56,20 +55,16 @@ /** * Base transformation class for the command spath
- * Allows the user to extract data from JSON or XML data formats - * using an spath / xpath expression. - * - *
spath input=... output=... path=...|...
- * Defaults: - *
spath input=_raw output=path path=...
- * Path omitted -> auto-extract mode: extracts all fields from the first 5000 characters in - * the input field. + * Allows the user to extract data from JSON or XML data formats using an spath / xpath expression. + *
spath input=... output=... path=...|...
Defaults:
spath input=_raw output=path path=...
Path + * omitted -> auto-extract mode: extracts all fields from the first 5000 characters in the input field. */ -public class SpathTransformation extends DPLParserBaseVisitor -{ +public class SpathTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(SpathTransformation.class); private final DPLParserCatalystContext catCtx; public SpathStep spathStep = null; + public SpathTransformation(DPLParserCatalystContext catCtx) { this.catCtx = catCtx; } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/StatsTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/StatsTransformation.java index d8b5cbe..0c82fc6 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/StatsTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/StatsTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.functions.dpf_02.SortByClause; @@ -72,147 +71,157 @@ * Base transformation class for the stats command */ public class StatsTransformation extends DPLParserBaseVisitor { - private static final Logger LOGGER = LoggerFactory.getLogger(StatsTransformation.class); - List queueOfAggregates = new ArrayList<>(); // contains all aggregate ColumnNodes - - final List byFields = new ArrayList<>(); - final List listOfByFields = new ArrayList<>(); // seq of fields to be used for groupBy - private final List listOfSbc = new ArrayList<>(); - - public StatsStep statsStep; - private final DPLParserCatalystContext catCtx; - - public StatsTransformation(DPLParserCatalystContext catCtx) { + + private static final Logger LOGGER = LoggerFactory.getLogger(StatsTransformation.class); + List queueOfAggregates = new ArrayList<>(); // contains all aggregate ColumnNodes + + final List byFields = new ArrayList<>(); + final List listOfByFields = new ArrayList<>(); // seq of fields to be used for groupBy + private final List listOfSbc = new ArrayList<>(); + + public StatsStep statsStep; + private final DPLParserCatalystContext catCtx; + + public StatsTransformation(DPLParserCatalystContext catCtx) { this.catCtx = catCtx; - } - - /* - * Command info: - * Performs the AggregateFunction with given fieldRenameInstruction and byInstruction - * Example command: - * index=index_A | stats avg(offset) AS avg_offset BY sourcetype - * - * Tree: - * --------------StatsTransformation------------------- - * ---|------------------------|------------------|----------------------------|------------ - * COMMAND_MODE_STATS aggregateFunction t_stats_fieldRenameInstruction t_stats_byInstruction - * 
------------------------------------------------|--------------------------|------------- - * --------------------------------------COMMAND_STATS_MODE_AS fieldType--COMMAND_STATS_MODE_BY fieldListType - * */ - public Node visitStatsTransformation(DPLParser.StatsTransformationContext ctx) { - Node rv = statsTransformationEmitCatalyst(ctx); - return rv; - } - - public Node statsTransformationEmitCatalyst(DPLParser.StatsTransformationContext ctx) { - // Process children - // COMMAND_MODE_STATS t_stats_partitions? t_stats_allnum? t_stats_delim? t_stats_agg - for (int i = 0; i < ctx.getChildCount(); ++i) { - ParseTree child = ctx.getChild(i); - LOGGER.debug("Processing child: <{}>", child.getText()); - if (child instanceof TerminalNode) { - LOGGER.debug("typeof child = TerminalNode"); - continue; /* Skip stats keyword */ - } - else if (child instanceof DPLParser.T_stats_aggContext) { - visit(child); - } - // FIXME Implement: t_stats_partitions , t_stats_allnum , t_stats_delim - } - - - List listOfCompleteAggregations = new ArrayList<>(); // contains all aggregate Columns - - for (int i = 0; i < queueOfAggregates.size(); ++i) { - Object item = queueOfAggregates.get(i); - - // If next in queue is a column - if (item instanceof ColumnNode) { - - // Check if out of index - Object nextToItem = null; - if (queueOfAggregates.size()-1 >= i+1) { - nextToItem = queueOfAggregates.get(i + 1); - } - - // Check for fieldRename - if (nextToItem instanceof String) { - listOfCompleteAggregations.add(((ColumnNode)item).getColumn().name((String)nextToItem)); - i++; - } - // No fieldRename - else { - listOfCompleteAggregations.add(((ColumnNode)item).getColumn()); - } - } - } - - statsStep = new StatsStep(listOfCompleteAggregations, listOfByFields); - SortStep sortStep = new SortStep(catCtx, listOfSbc, this.catCtx.getDplRecallSize(), false); - - List steps = new ArrayList<>(); - steps.add(statsStep); - steps.add(sortStep); - return new StepListNode(steps); - } - - // AS fieldType - public Node visitT_stats_fieldRenameInstruction(DPLParser.T_stats_fieldRenameInstructionContext ctx) { - return new StringNode(new Token(Type.STRING, ctx.getChild(1).getText())); - } - - // BY fieldListType - public Node visitT_stats_byInstruction(DPLParser.T_stats_byInstructionContext ctx) { - // Child #0 "BY" - // Child #1 fieldListType - return visit(ctx.getChild(1)); - } - - // fieldListType : fieldType ((COMMA)? fieldType)*? 
- public Node visitFieldListType(DPLParser.FieldListTypeContext ctx) { - List fields = new ArrayList<>(); - ctx.children.forEach(child -> { - String field = child.getText(); - fields.addAll(Arrays.asList(field.split(","))); - }); - - return new StringListNode(fields); - } - - // t_stats_agg - public Node visitT_stats_agg(DPLParser.T_stats_aggContext ctx) { - AggregateFunction aggregateFunction = new AggregateFunction(catCtx); - - ctx.children.forEach(child -> { - // AS fieldType - if (child instanceof DPLParser.T_stats_fieldRenameInstructionContext) { - LOGGER.debug("typeof child = fieldRenameInstructionCtx"); - queueOfAggregates.add(visit(child).toString()); - } - // BY fieldListType - else if (child instanceof DPLParser.T_stats_byInstructionContext) { - LOGGER.debug("typeof child = byInstructionCtx"); - byFields.addAll(((StringListNode)visit(child)).asList()); - listOfByFields.addAll(byFields.stream().map(functions::col).collect(Collectors.toList())); - listOfSbc.addAll(byFields.stream().map(this::createSbc).collect(Collectors.toList())); - } - // other; aggregateFunction visit - else if (child instanceof DPLParser.AggregateFunctionContext) { - LOGGER.debug("typeof child = AggregateFunctionCtx"); - queueOfAggregates.add((ColumnNode) aggregateFunction.visitAggregateFunction((DPLParser.AggregateFunctionContext) child)); - } - }); - - return null; - - } - - private SortByClause createSbc(String fieldName) { - SortByClause sbc = new SortByClause(); - sbc.setFieldName(fieldName); - sbc.setDescending(false); - sbc.setLimit(this.catCtx.getDplRecallSize()); - sbc.setSortAsType(SortByClause.Type.AUTOMATIC); - return sbc; - } + } + + /* + * Command info: + * Performs the AggregateFunction with given fieldRenameInstruction and byInstruction + * Example command: + * index=index_A | stats avg(offset) AS avg_offset BY sourcetype + * + * Tree: + * --------------StatsTransformation------------------- + * ---|------------------------|------------------|----------------------------|------------ + * COMMAND_MODE_STATS aggregateFunction t_stats_fieldRenameInstruction t_stats_byInstruction + * ------------------------------------------------|--------------------------|------------- + * --------------------------------------COMMAND_STATS_MODE_AS fieldType--COMMAND_STATS_MODE_BY fieldListType + * */ + public Node visitStatsTransformation(DPLParser.StatsTransformationContext ctx) { + Node rv = statsTransformationEmitCatalyst(ctx); + return rv; + } + + public Node statsTransformationEmitCatalyst(DPLParser.StatsTransformationContext ctx) { + // Process children + // COMMAND_MODE_STATS t_stats_partitions? t_stats_allnum? t_stats_delim? 
t_stats_agg + for (int i = 0; i < ctx.getChildCount(); ++i) { + ParseTree child = ctx.getChild(i); + LOGGER.debug("Processing child: <{}>", child.getText()); + if (child instanceof TerminalNode) { + LOGGER.debug("typeof child = TerminalNode"); + continue; /* Skip stats keyword */ + } + else if (child instanceof DPLParser.T_stats_aggContext) { + visit(child); + } + // FIXME Implement: t_stats_partitions , t_stats_allnum , t_stats_delim + } + + List listOfCompleteAggregations = new ArrayList<>(); // contains all aggregate Columns + + for (int i = 0; i < queueOfAggregates.size(); ++i) { + Object item = queueOfAggregates.get(i); + + // If next in queue is a column + if (item instanceof ColumnNode) { + + // Check if out of index + Object nextToItem = null; + if (queueOfAggregates.size() - 1 >= i + 1) { + nextToItem = queueOfAggregates.get(i + 1); + } + + // Check for fieldRename + if (nextToItem instanceof String) { + listOfCompleteAggregations.add(((ColumnNode) item).getColumn().name((String) nextToItem)); + i++; + } + // No fieldRename + else { + listOfCompleteAggregations.add(((ColumnNode) item).getColumn()); + } + } + } + + statsStep = new StatsStep(listOfCompleteAggregations, listOfByFields); + SortStep sortStep = new SortStep(catCtx, listOfSbc, this.catCtx.getDplRecallSize(), false); + + List steps = new ArrayList<>(); + steps.add(statsStep); + steps.add(sortStep); + return new StepListNode(steps); + } + + // AS fieldType + public Node visitT_stats_fieldRenameInstruction(DPLParser.T_stats_fieldRenameInstructionContext ctx) { + return new StringNode(new Token(Type.STRING, ctx.getChild(1).getText())); + } + + // BY fieldListType + public Node visitT_stats_byInstruction(DPLParser.T_stats_byInstructionContext ctx) { + // Child #0 "BY" + // Child #1 fieldListType + return visit(ctx.getChild(1)); + } + + // fieldListType : fieldType ((COMMA)? fieldType)*? 
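To make the aggregate/rename pairing and the BY-field handling above concrete, here is a small compilable sketch, assuming Spark SQL on the classpath, of what a query like 'stats avg(offset) AS avg_offset BY host,sourcetype' reduces to: one aggregate Column renamed via Column.name(...) plus one groupBy Column per comma-separated BY field. The StatsColumnsSketch class and the field names are illustrative assumptions.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.functions;

// Hypothetical demo mirroring how statsTransformationEmitCatalyst pairs an aggregate
// ColumnNode with a following rename string and how visitFieldListType splits BY fields.
public final class StatsColumnsSketch {
    public static void main(String[] args) {
        Column aggregate = functions.avg("offset").name("avg_offset"); // avg(offset) AS avg_offset
        List<Column> byColumns = new ArrayList<>();
        for (String field : "host,sourcetype".split(",")) { // BY host,sourcetype
            byColumns.add(functions.col(field));
        }
        System.out.println(aggregate);
        System.out.println(Arrays.toString(byColumns.toArray()));
    }
}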
+ public Node visitFieldListType(DPLParser.FieldListTypeContext ctx) { + List fields = new ArrayList<>(); + ctx.children.forEach(child -> { + String field = child.getText(); + fields.addAll(Arrays.asList(field.split(","))); + }); + + return new StringListNode(fields); + } + + // t_stats_agg + public Node visitT_stats_agg(DPLParser.T_stats_aggContext ctx) { + AggregateFunction aggregateFunction = new AggregateFunction(catCtx); + + ctx.children + .forEach( + child -> { + // AS fieldType + if (child instanceof DPLParser.T_stats_fieldRenameInstructionContext) { + LOGGER.debug("typeof child = fieldRenameInstructionCtx"); + queueOfAggregates.add(visit(child).toString()); + } + // BY fieldListType + else if (child instanceof DPLParser.T_stats_byInstructionContext) { + LOGGER.debug("typeof child = byInstructionCtx"); + byFields.addAll(((StringListNode) visit(child)).asList()); + listOfByFields + .addAll(byFields.stream().map(functions::col).collect(Collectors.toList())); + listOfSbc.addAll(byFields.stream().map(this::createSbc).collect(Collectors.toList())); + } + // other; aggregateFunction visit + else if (child instanceof DPLParser.AggregateFunctionContext) { + LOGGER.debug("typeof child = AggregateFunctionCtx"); + queueOfAggregates + .add( + (ColumnNode) aggregateFunction + .visitAggregateFunction( + (DPLParser.AggregateFunctionContext) child + ) + ); + } + } + ); + + return null; + + } + + private SortByClause createSbc(String fieldName) { + SortByClause sbc = new SortByClause(); + sbc.setFieldName(fieldName); + sbc.setDescending(false); + sbc.setLimit(this.catCtx.getDplRecallSize()); + sbc.setSortAsType(SortByClause.Type.AUTOMATIC); + return sbc; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/StrcatTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/StrcatTransformation.java index d9266ed..4d073a1 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/StrcatTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/StrcatTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.NullValue; @@ -62,100 +61,104 @@ * Used to concatenate two or more field values and/or string literals into a destination field */ public class StrcatTransformation extends DPLParserBaseVisitor { - private final Logger LOGGER = LoggerFactory.getLogger(StrcatTransformation.class); - public StrcatStep strcatStep = null; - private final NullValue nullValue; - public StrcatTransformation(NullValue nullValue) { - this.nullValue = nullValue; - } + private final Logger LOGGER = LoggerFactory.getLogger(StrcatTransformation.class); + public StrcatStep strcatStep = null; + private final NullValue nullValue; + + public StrcatTransformation(NullValue nullValue) { + this.nullValue = nullValue; + } - /** - *
-	 * -- Command info: --
-	 *
-	 * {@literal strcat [allrequired=]  }
-	 * Concatenates string values from 2 or more fields, combines string values and
-	 * literals into a new field. The destination field name is specified at the end of
-	 * the strcat command. allrequired is not a required argument, and it can be omitted.
-	 *
-	 * -- Grammar rules: --
-	 *
-	 * strcatTransformation
-	 *  : COMMAND_MODE_STRCAT (t_strcat_allrequiredParameter)? t_strcat_srcfieldsParameter fieldType
-	 *  ;
-	 *
-	 * t_strcat_allrequiredParameter
-	 *  : COMMAND_STRCAT_MODE_ALLREQUIRED booleanType
-	 *  ;
-	 *
-	 * t_strcat_srcfieldsParameter
-	 *  : (fieldType | stringType) (fieldType | stringType)+
-	 *  ;
-	 *
-	 * -- SQL: --
-	 *
-	 * strcat allRequired=bool field1 field2 ... fieldN destField
-	 *  to
-	 * SELECT CONCAT(field1, field2, ..., fieldN) AS destField FROM ˇtemporaryDPLViewˇ
-	 *  
- * */ - @Override - public Node visitStrcatTransformation(DPLParser.StrcatTransformationContext ctx) { - LOGGER.debug(String.format("Child count: %s in StrcatTransformation: %s", ctx.getChildCount(), ctx.getText())); - return strcatTransformationEmitCatalyst(ctx); - } + /** + *
+     * -- Command info: --
+     *
+     * {@literal strcat [allrequired=]  }
+     * Concatenates string values from 2 or more fields, combines string values and
+     * literals into a new field. The destination field name is specified at the end of
+     * the strcat command. allrequired is not a required argument, and it can be omitted.
+     *
+     * -- Grammar rules: --
+     *
+     * strcatTransformation
+     *  : COMMAND_MODE_STRCAT (t_strcat_allrequiredParameter)? t_strcat_srcfieldsParameter fieldType
+     *  ;
+     *
+     * t_strcat_allrequiredParameter
+     *  : COMMAND_STRCAT_MODE_ALLREQUIRED booleanType
+     *  ;
+     *
+     * t_strcat_srcfieldsParameter
+     *  : (fieldType | stringType) (fieldType | stringType)+
+     *  ;
+     *
+     * -- SQL: --
+     *
+     * strcat allRequired=bool field1 field2 ... fieldN destField
+     *  to
+     * SELECT CONCAT(field1, field2, ..., fieldN) AS destField FROM ˇtemporaryDPLViewˇ
+     *  
+ */ + @Override + public Node visitStrcatTransformation(DPLParser.StrcatTransformationContext ctx) { + LOGGER.debug(String.format("Child count: %s in StrcatTransformation: %s", ctx.getChildCount(), ctx.getText())); + return strcatTransformationEmitCatalyst(ctx); + } - /** - * Emit catalyst from strcatTransformation - * @param ctx StrcatTransformationContext - * @return CatalystNode containing resultset - */ - public Node strcatTransformationEmitCatalyst(DPLParser.StrcatTransformationContext ctx) { - // syntax: strcat allrequired src-fields dest-field - // child# 0 1 2 3 - this.strcatStep = new StrcatStep(nullValue); - visitChildren(ctx); - return new StepNode(this.strcatStep); - } + /** + * Emit catalyst from strcatTransformation + * + * @param ctx StrcatTransformationContext + * @return CatalystNode containing resultset + */ + public Node strcatTransformationEmitCatalyst(DPLParser.StrcatTransformationContext ctx) { + // syntax: strcat allrequired src-fields dest-field + // child# 0 1 2 3 + this.strcatStep = new StrcatStep(nullValue); + visitChildren(ctx); + return new StepNode(this.strcatStep); + } - /**
-	 * t_strcat_allrequiredParameter
-	 *  : COMMAND_STRCAT_MODE_ALLREQUIRED booleanType
-	 *  ;
-	 *
-	 *  If the parameter exists, the second child (child#1) contains the boolean type whether or not all source fields are required
-	 *  
- */ - @Override - public Node visitT_strcat_allrequiredParameter(DPLParser.T_strcat_allrequiredParameterContext ctx) { - this.strcatStep.setAllRequired(ctx.booleanType().GET_BOOLEAN_TRUE() != null); - return null; - } + /** + *
+     * t_strcat_allrequiredParameter
+     *  : COMMAND_STRCAT_MODE_ALLREQUIRED booleanType
+     *  ;
+     *
+     *  If the parameter exists, the second child (child#1) contains the boolean value indicating whether or not all source fields are required
+     *  
+ */ + @Override + public Node visitT_strcat_allrequiredParameter(DPLParser.T_strcat_allrequiredParameterContext ctx) { + this.strcatStep.setAllRequired(ctx.booleanType().GET_BOOLEAN_TRUE() != null); + return null; + } - /**
-	 * 	t_strcat_srcfieldsParameter
-	 *   : (fieldType | stringType) (fieldType | stringType)+
-	 *   ;
-	 *
-	 *  Contains all the source fields, one or more.
-	 *   Adds all fields into an array, while stripping quotes from each one of the fields.
- */ - @Override - public Node visitT_strcat_srcfieldsParameter(DPLParser.T_strcat_srcfieldsParameterContext ctx) { - List srcFields = new ArrayList<>(); + /** + *
+     * 	t_strcat_srcfieldsParameter
+     *   : (fieldType | stringType) (fieldType | stringType)+
+     *   ;
+     *
+     *  Contains all the source fields, one or more.
+     *   Adds all fields into an array, while stripping quotes from each one of the fields.
+ */ + @Override + public Node visitT_strcat_srcfieldsParameter(DPLParser.T_strcat_srcfieldsParameterContext ctx) { + List srcFields = new ArrayList<>(); - ctx.children.forEach(child -> srcFields.add(child.getText())); + ctx.children.forEach(child -> srcFields.add(child.getText())); - this.strcatStep.setListOfFields(srcFields); - this.strcatStep.setNumberOfSrcFieldsOriginally(ctx.getChildCount()); + this.strcatStep.setListOfFields(srcFields); + this.strcatStep.setNumberOfSrcFieldsOriginally(ctx.getChildCount()); - return null; - } + return null; + } - @Override - public Node visitT_strcat_destfieldParameter(DPLParser.T_strcat_destfieldParameterContext ctx) { - this.strcatStep.setDestField(ctx.fieldType().getText()); - return null; - } + @Override + public Node visitT_strcat_destfieldParameter(DPLParser.T_strcat_destfieldParameterContext ctx) { + this.strcatStep.setDestField(ctx.fieldType().getText()); + return null; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TableTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TableTransformation.java index 96bf1d1..9866508 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TableTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TableTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.TextString; @@ -64,6 +63,7 @@ * Used to generate a table with the same order and fields as given in the command */ public class TableTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(TableTransformation.class); List fieldList = null; public TableStep tableStep = null; @@ -91,7 +91,7 @@ public Node visitT_table_wcfieldListParameter(DPLParser.T_table_wcfieldListParam List listOfFields = new ArrayList<>(); ctx.t_table_fieldType().forEach(fieldType -> { - String fieldName = ((StringNode)visit(fieldType)).toString(); + String fieldName = ((StringNode) visit(fieldType)).toString(); if (!fieldName.equals("")) { listOfFields.addAll(Arrays.asList(fieldName.split(","))); @@ -112,5 +112,4 @@ public Node visitT_table_fieldType(DPLParser.T_table_fieldTypeContext ctx) { return new StringNode(new Token(Token.Type.STRING, fieldName)); } - } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TeragrepTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TeragrepTransformation.java index b1f9c03..70d40d2 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TeragrepTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TeragrepTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.*; @@ -75,6 +74,7 @@ * Class containing the visitor methods for all "| teragrep" subcommands */ public class TeragrepTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(TeragrepTransformation.class); DPLParserCatalystContext catCtx; DPLParserCatalystVisitor catVisitor; @@ -93,15 +93,14 @@ public class TeragrepTransformation extends DPLParserBaseVisitor { private static final String portCfgItem = "dpl.pth_10.transform.teragrep.syslog.parameter.port"; private static final String enforceDestinationCfgItem = "dpl.pth_10.transform.teragrep.syslog.restrictedMode"; - public TeragrepTransformation(DPLParserCatalystContext catCtx, DPLParserCatalystVisitor catVisitor) - { + public TeragrepTransformation(DPLParserCatalystContext catCtx, DPLParserCatalystVisitor catVisitor) { this.catCtx = catCtx; this.catVisitor = catVisitor; } /** - * Topmost visitor, teragrep subcommand visiting starts from this function - * COMMAND_MODE_TERAGREP (t_modeParameter | t_getParameter) t_hostParameter? + * Topmost visitor, teragrep subcommand visiting starts from this function COMMAND_MODE_TERAGREP (t_modeParameter | + * t_getParameter) t_hostParameter? 
*/ @Override public Node visitTeragrepTransformation(DPLParser.TeragrepTransformationContext ctx) { @@ -109,8 +108,9 @@ public Node visitTeragrepTransformation(DPLParser.TeragrepTransformationContext } /** - * Visits the subrules and sets the parameters based on the parse tree. - * Also uses the zeppelin config to set defaults, if available + * Visits the subrules and sets the parameters based on the parse tree. Also uses the zeppelin config to set + * defaults, if available + * * @param ctx Main parse tree * @return CatalystNode */ @@ -139,8 +139,11 @@ private Node teragrepTransformationEmitCatalyst(DPLParser.TeragrepTransformation } } else { - LOGGER.error("Zeppelin config was not provided to the Teragrep command: host and port will be set as default, {}", - "and no destination will be enforced."); + LOGGER + .error( + "Zeppelin config was not provided to the Teragrep command: host and port will be set as default, {}", + "and no destination will be enforced." + ); } return visit(ctx.getChild(1)); @@ -148,17 +151,20 @@ private Node teragrepTransformationEmitCatalyst(DPLParser.TeragrepTransformation /** * Sets the cmdMode based on the parse tree given
+ * * @param ctx getParameter sub parse tree * @return null, as the function sets a global variable cmdMode */ @Override - public Node visitT_getParameter(DPLParser.T_getParameterContext ctx) { + public Node visitT_getParameter(DPLParser.T_getParameterContext ctx) { // get archive summary OR get system version if (ctx.t_getTeragrepVersionParameter() != null) { return visit(ctx.t_getTeragrepVersionParameter()); - } else if (ctx.t_getArchiveSummaryParameter() != null) { + } + else if (ctx.t_getArchiveSummaryParameter() != null) { return visit(ctx.t_getArchiveSummaryParameter()); - } else { + } + else { throw new IllegalArgumentException("Unsupported teragrep command: " + ctx.getText()); } } @@ -171,8 +177,10 @@ public Node visitT_getArchiveSummaryParameter(DPLParser.T_getArchiveSummaryParam doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument(); } catch (ParserConfigurationException pce) { - throw new RuntimeException("Error occurred during initialization of XML document in metadata query: <{" + - pce.getMessage() + "}>"); + throw new RuntimeException( + "Error occurred during initialization of XML document in metadata query: <{" + pce.getMessage() + + "}>" + ); } // get metadata via logicalStatement and isMetadataQuery=true LogicalStatementXML logiXml = new LogicalStatementXML(catCtx, doc, true); @@ -190,6 +198,7 @@ public Node visitT_getTeragrepVersionParameter(DPLParser.T_getTeragrepVersionPar /** * Sets the host and port, if given + * * @param ctx hostParameter sub parse tree * @return null, as the function sets the global variables host and port */ @@ -254,20 +263,26 @@ public Node visitT_saveModeParameter(DPLParser.T_saveModeParameterContext ctx) { if (ctx.t_pathParameter() != null && !ctx.t_pathParameter().isEmpty()) { if (ctx.t_pathParameter().size() != 1) { - throw new IllegalArgumentException("Path parameter was provided multiple times! Please provide it only once."); + throw new IllegalArgumentException( + "Path parameter was provided multiple times! Please provide it only once." + ); } hdfsPath = visit(ctx.t_pathParameter(0)).toString(); } if (ctx.t_retentionParameter() != null && !ctx.t_retentionParameter().isEmpty()) { if (ctx.t_retentionParameter().size() != 1) { - throw new IllegalArgumentException("Retention parameter was provided multiple times! Please provide it only once."); + throw new IllegalArgumentException( + "Retention parameter was provided multiple times! Please provide it only once." + ); } hdfsRetentionSpan = ctx.t_retentionParameter(0).spanType().getText(); } if (ctx.t_overwriteParameter() != null && !ctx.t_overwriteParameter().isEmpty()) { if (ctx.t_overwriteParameter().size() != 1) { - throw new IllegalArgumentException("Overwrite parameter was provided multiple times! Please provide it only once."); + throw new IllegalArgumentException( + "Overwrite parameter was provided multiple times! Please provide it only once." 
+ ); } TerminalNode overwriteBoolNode = (TerminalNode) ctx.t_overwriteParameter(0).booleanType().getChild(0); switch (overwriteBoolNode.getSymbol().getType()) { @@ -278,15 +293,19 @@ public Node visitT_saveModeParameter(DPLParser.T_saveModeParameterContext ctx) { hdfsOverwrite = false; break; default: - throw new RuntimeException("Expected a boolean value for parameter 'overwrite', instead it was something else.\n" + - "Try replacing the text after 'overwrite=' with 'true' or 'false'."); + throw new RuntimeException( + "Expected a boolean value for parameter 'overwrite', instead it was something else.\n" + + "Try replacing the text after 'overwrite=' with 'true' or 'false'." + ); } } TeragrepHdfsSaveStep.Format format = TeragrepHdfsSaveStep.Format.AVRO; if (ctx.t_hdfsFormatParameter() != null && !ctx.t_hdfsFormatParameter().isEmpty()) { if (ctx.t_hdfsFormatParameter().size() != 1) { - throw new IllegalArgumentException("'format=' parameter was provided multiple times! Please provide it only once."); + throw new IllegalArgumentException( + "'format=' parameter was provided multiple times! Please provide it only once." + ); } TerminalNode formatNode = (TerminalNode) ctx.t_hdfsFormatParameter(0).getChild(1); @@ -302,7 +321,9 @@ public Node visitT_saveModeParameter(DPLParser.T_saveModeParameterContext ctx) { if (ctx.t_headerParameter() != null && !ctx.t_headerParameter().isEmpty()) { if (ctx.t_headerParameter().size() != 1) { - throw new IllegalArgumentException("'header=' parameter was provided multiple times! Please provide it only once."); + throw new IllegalArgumentException( + "'header=' parameter was provided multiple times! Please provide it only once." + ); } TerminalNode headerNode = (TerminalNode) ctx.t_headerParameter(0).booleanType().getChild(0); switch (headerNode.getSymbol().getType()) { @@ -317,7 +338,9 @@ public Node visitT_saveModeParameter(DPLParser.T_saveModeParameterContext ctx) { } } - return new StepNode(new TeragrepHdfsSaveStep(catCtx, hdfsOverwrite, hdfsPath, hdfsRetentionSpan, format, header)); + return new StepNode( + new TeragrepHdfsSaveStep(catCtx, hdfsOverwrite, hdfsPath, hdfsRetentionSpan, format, header) + ); } // exec hdfs load path @@ -329,7 +352,9 @@ public Node visitT_loadModeParameter(DPLParser.T_loadModeParameterContext ctx) { if (ctx.t_pathParameter() != null && !ctx.t_pathParameter().isEmpty()) { if (ctx.t_pathParameter().size() != 1) { - throw new IllegalArgumentException("Path parameter was provided multiple times! Please provide it only once."); + throw new IllegalArgumentException( + "Path parameter was provided multiple times! Please provide it only once." + ); } hdfsPath = visit(ctx.t_pathParameter(0)).toString(); } @@ -337,7 +362,9 @@ public Node visitT_loadModeParameter(DPLParser.T_loadModeParameterContext ctx) { TeragrepHdfsLoadStep.Format format = TeragrepHdfsLoadStep.Format.AVRO; if (ctx.t_hdfsFormatParameter() != null && !ctx.t_hdfsFormatParameter().isEmpty()) { if (ctx.t_hdfsFormatParameter().size() != 1) { - throw new IllegalArgumentException("'format=' parameter was provided multiple times! Please provide it only once."); + throw new IllegalArgumentException( + "'format=' parameter was provided multiple times! Please provide it only once." 
+ ); } TerminalNode formatNode = (TerminalNode) ctx.t_hdfsFormatParameter(0).getChild(1); switch (formatNode.getSymbol().getType()) { @@ -352,7 +379,9 @@ public Node visitT_loadModeParameter(DPLParser.T_loadModeParameterContext ctx) { if (ctx.t_headerParameter() != null && !ctx.t_headerParameter().isEmpty()) { if (ctx.t_headerParameter().size() != 1) { - throw new IllegalArgumentException("'header=' parameter was provided multiple times! Please provide it only once."); + throw new IllegalArgumentException( + "'header=' parameter was provided multiple times! Please provide it only once." + ); } TerminalNode headerNode = (TerminalNode) ctx.t_headerParameter(0).booleanType().getChild(0); switch (headerNode.getSymbol().getType()) { @@ -368,8 +397,10 @@ public Node visitT_loadModeParameter(DPLParser.T_loadModeParameterContext ctx) { } if (ctx.t_schemaParameter() != null && !ctx.t_schemaParameter().isEmpty()) { - if (ctx.t_schemaParameter().size() != 1){ - throw new IllegalArgumentException("'schema=' parameter was provided multiple times! Please provide it only once."); + if (ctx.t_schemaParameter().size() != 1) { + throw new IllegalArgumentException( + "'schema=' parameter was provided multiple times! Please provide it only once." + ); } schema = new UnquotedText(new TextString(ctx.t_schemaParameter(0).stringType().getText())).read(); } @@ -433,26 +464,35 @@ else if (ctx.t_bloomOptionParameter().COMMAND_TERAGREP_MODE_UPDATE() != null) { // bloom update mode = TeragrepBloomStep.BloomMode.UPDATE; } - else if(ctx.t_bloomOptionParameter().COMMAND_TERAGREP_MODE_ESTIMATE() != null) { + else if (ctx.t_bloomOptionParameter().COMMAND_TERAGREP_MODE_ESTIMATE() != null) { // bloom estimate mode = TeragrepBloomStep.BloomMode.ESTIMATE; } if (ctx.t_bloomOptionParameter().t_inputParamater() != null) { - inputCol = new UnquotedText(new TextString(ctx.t_bloomOptionParameter().t_inputParamater().fieldType().getText())).read(); - } else { + inputCol = new UnquotedText( + new TextString(ctx.t_bloomOptionParameter().t_inputParamater().fieldType().getText()) + ).read(); + } + else { inputCol = "tokens"; } if (ctx.t_bloomOptionParameter().t_outputParameter() != null) { - outputCol = new UnquotedText(new TextString(ctx.t_bloomOptionParameter().t_outputParameter().fieldType().getText())).read(); - } else { + outputCol = new UnquotedText( + new TextString(ctx.t_bloomOptionParameter().t_outputParameter().fieldType().getText()) + ).read(); + } + else { outputCol = String.format("estimate(%s)", inputCol); } if (ctx.t_bloomOptionParameter().t_estimatesParameter() != null) { - estimateCol = new UnquotedText(new TextString(ctx.t_bloomOptionParameter().t_estimatesParameter().fieldType().getText())).read(); - } else { + estimateCol = new UnquotedText( + new TextString(ctx.t_bloomOptionParameter().t_estimatesParameter().fieldType().getText()) + ).read(); + } + else { estimateCol = String.format("estimate(%s)", inputCol); } } @@ -461,8 +501,13 @@ else if(ctx.t_bloomOptionParameter().COMMAND_TERAGREP_MODE_ESTIMATE() != null) { if (mode == TeragrepBloomStep.BloomMode.CREATE || mode == TeragrepBloomStep.BloomMode.UPDATE) { // create aggregate step to run before bloom create and bloom update - TeragrepBloomStep aggregateStep = new TeragrepBloomStep(this.zplnConfig, TeragrepBloomStep.BloomMode.AGGREGATE, - inputCol, outputCol, estimateCol); + TeragrepBloomStep aggregateStep = new TeragrepBloomStep( + this.zplnConfig, + TeragrepBloomStep.BloomMode.AGGREGATE, + inputCol, + outputCol, + estimateCol + ); return new 
StepListNode(Arrays.asList(aggregateStep, bloomStep)); } @@ -477,13 +522,18 @@ public Node visitT_tokenizerParameter(DPLParser.T_tokenizerParameterContext ctx) String outputCol = "tokens"; AbstractTokenizerStep.TokenizerFormat tokenizerFormat = AbstractTokenizerStep.TokenizerFormat.STRING; if (ctx.t_formatParameter() != null) { - final String format = new UnquotedText(new TextString(ctx.t_formatParameter().stringType().getText())).read(); + final String format = new UnquotedText(new TextString(ctx.t_formatParameter().stringType().getText())) + .read(); if (format.equalsIgnoreCase("string")) { tokenizerFormat = AbstractTokenizerStep.TokenizerFormat.STRING; - } else if (format.equalsIgnoreCase("bytes")) { + } + else if (format.equalsIgnoreCase("bytes")) { tokenizerFormat = AbstractTokenizerStep.TokenizerFormat.BYTES; - } else { - throw new IllegalArgumentException("Invalid format parameter '" + format + "'. Expected 'string' or 'bytes'"); + } + else { + throw new IllegalArgumentException( + "Invalid format parameter '" + format + "'. Expected 'string' or 'bytes'" + ); } } if (ctx.t_inputParamater() != null) { @@ -502,15 +552,20 @@ public Node visitT_dynatraceParameter(DPLParser.T_dynatraceParameterContext ctx) final String url; if (ctx.stringType() != null) { metricKey = new UnquotedText(new TextString(ctx.stringType().getText())).read(); - } else if (catCtx.getNotebookUrl() != null && !catCtx.getNotebookUrl().isEmpty()){ + } + else if (catCtx.getNotebookUrl() != null && !catCtx.getNotebookUrl().isEmpty()) { metricKey = catCtx.getNotebookUrl(); - } else { + } + else { metricKey = "NoteBookID"; } - if (catCtx.getConfig() != null && catCtx.getConfig().hasPath("dpl.pth_10.transform.teragrep.dynatrace.api.url")) { + if ( + catCtx.getConfig() != null && catCtx.getConfig().hasPath("dpl.pth_10.transform.teragrep.dynatrace.api.url") + ) { url = catCtx.getConfig().getString("dpl.pth_10.transform.teragrep.dynatrace.api.url"); - } else { + } + else { url = "http://localhost:9001/metrics/ingest"; } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TimechartTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TimechartTransformation.java index 1c51e77..2ce11d7 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TimechartTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TimechartTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.*; @@ -72,6 +71,7 @@ *
Dataset.groupBy("_time").pivot(aggregateField).sum(fieldname)
*/ public class TimechartTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(TimechartTransformation.class); private DPLParserCatalystContext catCtx = null; private DPLParserCatalystVisitor catVisitor; @@ -95,23 +95,15 @@ public String getAggregateField() { return this.aggregateField; } - - /** - timechartTransformation - : COMMAND_MODE_TIMECHART - (t_timechart_sepParameter)? - (t_timechart_formatParameter)? - (t_timechart_fixedrangeParameter)? - (t_timechart_partialParameter)? - (t_timechart_contParameter)? - (t_timechart_limitParameter)? - (t_timechart_binOptParameter)* - (t_timechart_singleAggregation|t_timechart_intervalInstruction|EVAL_LANGUAGE_MODE_PARENTHESIS_L evalStatement EVAL_LANGUAGE_MODE_PARENTHESIS_R)+ - (t_timechart_divideByInstruction) - ; - */ - @Override public Node visitTimechartTransformation(DPLParser.TimechartTransformationContext ctx) { + * timechartTransformation : COMMAND_MODE_TIMECHART (t_timechart_sepParameter)? (t_timechart_formatParameter)? + * (t_timechart_fixedrangeParameter)? (t_timechart_partialParameter)? (t_timechart_contParameter)? + * (t_timechart_limitParameter)? (t_timechart_binOptParameter)* + * (t_timechart_singleAggregation|t_timechart_intervalInstruction|EVAL_LANGUAGE_MODE_PARENTHESIS_L evalStatement + * EVAL_LANGUAGE_MODE_PARENTHESIS_R)+ (t_timechart_divideByInstruction) ; + */ + @Override + public Node visitTimechartTransformation(DPLParser.TimechartTransformationContext ctx) { return timechartTransformationEmitCatalyst(ctx); } @@ -120,8 +112,7 @@ private Node timechartTransformationEmitCatalyst(DPLParser.TimechartTransformati Column span = null; - - if (ctx.t_timechart_binOptParameter() != null && !ctx.t_timechart_binOptParameter().isEmpty()) { + if (ctx.t_timechart_binOptParameter() != null && !ctx.t_timechart_binOptParameter().isEmpty()) { LOGGER.info("Timechart Optional parameters: <[{}]>", ctx.t_timechart_binOptParameter().get(0).getText()); ColumnNode spanNode = (ColumnNode) visit(ctx.t_timechart_binOptParameter().get(0)); @@ -148,7 +139,9 @@ private Node timechartTransformationEmitCatalyst(DPLParser.TimechartTransformati } } else if (child instanceof DPLParser.T_timechart_divideByInstructionContext) { - String divByInst = ((StringNode)visitT_timechart_divideByInstruction((DPLParser.T_timechart_divideByInstructionContext) child)).toString(); + String divByInst = ((StringNode) visitT_timechart_divideByInstruction( + (DPLParser.T_timechart_divideByInstructionContext) child + )).toString(); listOfDivideByInst.add(divByInst); } else if (child instanceof DPLParser.T_timechart_fieldRenameInstructionContext) { @@ -183,6 +176,7 @@ else if (child instanceof DPLParser.T_timechart_fieldRenameInstructionContext) { /** * Convert span of type Column to the span length in seconds + * * @param span span of type column * @return span length in seconds */ @@ -199,7 +193,8 @@ private long getSpanSeconds(Column span) { if (!isWithinNumber) { break; } - } else if (isWithinNumber && spanChar != ' ') { + } + else if (isWithinNumber && spanChar != ' ') { num.append(spanChar); } } @@ -215,25 +210,26 @@ private long getSpanSeconds(Column span) { @Override public Node visitAggregateFunction(DPLParser.AggregateFunctionContext ctx) { Node rv = aggregateFunction.visitAggregateFunction(ctx); - if(aggregateField == null) + if (aggregateField == null) aggregateField = aggregateFunction.getAggregateField(); return aggregateFunction.visitAggregateFunction(ctx); } @Override - public Node 
visitT_timechart_divideByInstruction(DPLParser.T_timechart_divideByInstructionContext ctx ){ -// LOGGER.info(ctx.getChildCount()+"--visitT_chart_divideByInstruction incoming{}", ctx.getText()); - if(ctx.getChildCount() == 0){ + public Node visitT_timechart_divideByInstruction(DPLParser.T_timechart_divideByInstructionContext ctx) { + // LOGGER.info(ctx.getChildCount()+"--visitT_chart_divideByInstruction incoming{}", ctx.getText()); + if (ctx.getChildCount() == 0) { return null; } String target = ctx.getChild(1).getChild(0).toString(); if (doc != null) { - Element el = doc.createElement("divideBy"); - el.setAttribute("field", target); - return new ElementNode(el); - } else { - return new StringNode(new Token(Type.STRING, target)); + Element el = doc.createElement("divideBy"); + el.setAttribute("field", target); + return new ElementNode(el); + } + else { + return new StringNode(new Token(Type.STRING, target)); } } @@ -244,18 +240,21 @@ public Node visitT_timechart_fieldRenameInstruction(DPLParser.T_timechart_fieldR Element el = doc.createElement("fieldRename"); el.setAttribute("field", field); return new ElementNode(el); - } else { + } + else { return new StringNode(new Token(Type.STRING, field)); } } - @Override public Node visitT_timechart_binOptParameter(DPLParser.T_timechart_binOptParameterContext ctx) { - LOGGER.info("visitT_timechart_binOptParameter:<{}>",ctx.getText()); + @Override + public Node visitT_timechart_binOptParameter(DPLParser.T_timechart_binOptParameterContext ctx) { + LOGGER.info("visitT_timechart_binOptParameter:<{}>", ctx.getText()); return visitChildren(ctx); } - @Override public Node visitT_timechart_binSpanParameter(DPLParser.T_timechart_binSpanParameterContext ctx) { - LOGGER.info("visitT_timechart_binSpanParameter:<{}>",ctx.getText()); + @Override + public Node visitT_timechart_binSpanParameter(DPLParser.T_timechart_binSpanParameterContext ctx) { + LOGGER.info("visitT_timechart_binSpanParameter:<{}>", ctx.getText()); CalendarInterval ival = getSpanLength(ctx.getChild(1).getText()); Column col = new Column("_time"); Column span = functions.window(col, String.valueOf(ival)); @@ -265,94 +264,96 @@ public Node visitT_timechart_fieldRenameInstruction(DPLParser.T_timechart_fieldR /** * Creates a column with default span of one hour + * * @return */ - private Column createDefaultSpan(){ + private Column createDefaultSpan() { long sec = 0; TimeRange tr = TimeRange.ONE_HOUR; String duration = "1 days"; // Default duration -// LOGGER.info("createDefaultSpan="+catCtx.getTimeRange()); + // LOGGER.info("createDefaultSpan="+catCtx.getTimeRange()); DPLParserConfig pConf = catCtx.getParserConfig(); - if(pConf != null){ - tr=pConf.getTimeRange(); + if (pConf != null) { + tr = pConf.getTimeRange(); } - switch(tr){ - case TEN_SECONDS:{ + switch (tr) { + case TEN_SECONDS: { sec = 10; duration = "10 seconds"; break; } - case ONE_MINUTE:{ + case ONE_MINUTE: { sec = 60; duration = "1 minutes"; break; } - case FIVE_MINUTES:{ - sec = 5*60; + case FIVE_MINUTES: { + sec = 5 * 60; duration = "5 minutes"; break; } - case THIRTY_MINUTES:{ - sec = 30*60; + case THIRTY_MINUTES: { + sec = 30 * 60; duration = "30 minutes"; break; } - case ONE_HOUR:{ + case ONE_HOUR: { sec = 3600; duration = "1 hours"; break; } - case ONE_DAY:{ - sec = 24*3600; + case ONE_DAY: { + sec = 24 * 3600; duration = "1 days"; break; } - case ONE_MONTH:{ - sec = 30*24*3600; + case ONE_MONTH: { + sec = 30 * 24 * 3600; duration = "30 days"; break; } - default :{ + default: { throw new RuntimeException("timechart span duration 
greater that month is not supported"); } } - CalendarInterval ival = new CalendarInterval(0,0, sec*1000*1000); - return functions.window(new Column("_time"), String.valueOf(ival), duration, "0 minutes"); + CalendarInterval ival = new CalendarInterval(0, 0, sec * 1000 * 1000); + return functions.window(new Column("_time"), String.valueOf(ival), duration, "0 minutes"); } /** * Gets the CalendarInterval of string form span + * * @param value span as string * @return CalendarInterval */ - private CalendarInterval getSpanLength(String value){ + private CalendarInterval getSpanLength(String value) { // incoming span-length consist of // [] // default timescale is sec - String timescale="sec"; + String timescale = "sec"; int numericalValue; int month = 0; long sec = 0; Pattern p = Pattern.compile("\\d+"); Matcher m = p.matcher(value); - if(m.lookingAt()) { + if (m.lookingAt()) { numericalValue = Integer.parseInt(m.group()); String[] parts = value.split(m.group()); - if(parts.length>1) + if (parts.length > 1) timescale = parts[1].trim(); } else { LOGGER.error("Span length error: missing numerical value:<{}>", value); - throw new RuntimeException("getSpanLength, missing numerical value:"+value); + throw new RuntimeException("getSpanLength, missing numerical value:" + value); } // Calculate value - switch(timescale){ + switch (timescale) { case "s": case "sec": case "secs": case "second": case "seconds": - case "S":{ + case "S": { sec = numericalValue; break; } @@ -361,7 +362,7 @@ private CalendarInterval getSpanLength(String value){ case "mins": case "minute": case "minutes": - case "M":{ + case "M": { sec = numericalValue * 60L; break; } @@ -370,54 +371,117 @@ private CalendarInterval getSpanLength(String value){ case "hrs": case "hour": case "hours": - case "H":{ + case "H": { sec = numericalValue * 3600L; break; } case "d": case "day": case "days": - case "D":{ + case "D": { sec = numericalValue * 3600L * 24; break; } case "w": case "week": case "weeks": - case "W":{ + case "W": { sec = numericalValue * 3600L * 24 * 7; break; } case "mon": case "month": case "months": - case "MON":{ + case "MON": { //month = numericalValue; // month is not supported as such, it needs to be changed seconds // use 30 as default month length - sec = (long) numericalValue *30*24*3600; + sec = (long) numericalValue * 30 * 24 * 3600; break; } } - return new CalendarInterval(month, 0, sec*1000*1000L); - } - - @Override public Node visitT_timechart_binsParameter(DPLParser.T_timechart_binsParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_binStartEndParameter(DPLParser.T_timechart_binStartEndParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_contParameter(DPLParser.T_timechart_contParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_fixedrangeParameter(DPLParser.T_timechart_fixedrangeParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_formatParameter(DPLParser.T_timechart_formatParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_limitParameter(DPLParser.T_timechart_limitParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_partialParameter(DPLParser.T_timechart_partialParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_sepParameter(DPLParser.T_timechart_sepParameterContext ctx) { return visitChildren(ctx); } - @Override public Node 
visitT_timechart_binMinspanParameter(DPLParser.T_timechart_binMinspanParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_binAligntimeParameter(DPLParser.T_timechart_binAligntimeParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_whereInstruction(DPLParser.T_timechart_whereInstructionContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_nullstrParameter(DPLParser.T_timechart_nullstrParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_otherstrParameter(DPLParser.T_timechart_otherstrParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_usenullParameter(DPLParser.T_timechart_usenullParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_useotherParameter(DPLParser.T_timechart_useotherParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_evaledField(DPLParser.T_timechart_evaledFieldContext ctx) { return visitChildren(ctx); } + return new CalendarInterval(month, 0, sec * 1000 * 1000L); + } + + @Override + public Node visitT_timechart_binsParameter(DPLParser.T_timechart_binsParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_binStartEndParameter(DPLParser.T_timechart_binStartEndParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_contParameter(DPLParser.T_timechart_contParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_fixedrangeParameter(DPLParser.T_timechart_fixedrangeParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_formatParameter(DPLParser.T_timechart_formatParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_limitParameter(DPLParser.T_timechart_limitParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_partialParameter(DPLParser.T_timechart_partialParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_sepParameter(DPLParser.T_timechart_sepParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_binMinspanParameter(DPLParser.T_timechart_binMinspanParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_binAligntimeParameter(DPLParser.T_timechart_binAligntimeParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_whereInstruction(DPLParser.T_timechart_whereInstructionContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_nullstrParameter(DPLParser.T_timechart_nullstrParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_otherstrParameter(DPLParser.T_timechart_otherstrParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_usenullParameter(DPLParser.T_timechart_usenullParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_useotherParameter(DPLParser.T_timechart_useotherParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_evaledField(DPLParser.T_timechart_evaledFieldContext ctx) { + return visitChildren(ctx); + } /*@Override 
public Node visitT_timechart_singleAggregation(DPLParser.T_timechart_singleAggregationContext ctx) { String oper = ctx.getText(); @@ -427,7 +491,7 @@ private CalendarInterval getSpanLength(String value){ if(oper.equalsIgnoreCase("count") || oper.equalsIgnoreCase("c")) { aggregateField = "count"; // use default name col = org.apache.spark.sql.functions.count(defaultField); -// LOGGER.info("T_timechart_singleAggregation (Catalyst):{}", col.expr().sql()+" default field="+defaultField); + // LOGGER.info("T_timechart_singleAggregation (Catalyst):{}", col.expr().sql()+" default field="+defaultField); traceBuffer.add("Visit AggregateMethodCount(Catalyst):{}", col.expr().sql()); rv = new ColumnNode(col); }else { @@ -438,17 +502,29 @@ private CalendarInterval getSpanLength(String value){ if(ctx.t_timechart_fieldRenameInstruction() != null){ Node renameCmd = visitT_timechart_fieldRenameInstruction(ctx.t_timechart_fieldRenameInstruction()); aggregateField = renameCmd.toString(); -// rv = new ColumnNode(((ColumnNode) rv).getColumn().as(renameCmd.toString())); + // rv = new ColumnNode(((ColumnNode) rv).getColumn().as(renameCmd.toString())); } return rv; }*/ - @Override public Node visitSpanType(DPLParser.SpanTypeContext ctx) { -// LOGGER.info("visitSpanType:"+ctx.getText()); + @Override + public Node visitSpanType(DPLParser.SpanTypeContext ctx) { + // LOGGER.info("visitSpanType:"+ctx.getText()); return visitChildren(ctx); } - @Override public Node visitT_timechart_aggParameter(DPLParser.T_timechart_aggParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_dedupSplitParameter(DPLParser.T_timechart_dedupSplitParameterContext ctx) { return visitChildren(ctx); } - @Override public Node visitT_timechart_tcOpt(DPLParser.T_timechart_tcOptContext ctx) { return visitChildren(ctx); } + @Override + public Node visitT_timechart_aggParameter(DPLParser.T_timechart_aggParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_dedupSplitParameter(DPLParser.T_timechart_dedupSplitParameterContext ctx) { + return visitChildren(ctx); + } + + @Override + public Node visitT_timechart_tcOpt(DPLParser.T_timechart_tcOptContext ctx) { + return visitChildren(ctx); + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TopTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TopTransformation.java index 651f811..dc36763 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TopTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TopTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -56,32 +55,29 @@ import org.slf4j.LoggerFactory; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; /** * Contains the visitor methods for the top command
* Limits the dataset to n results */ public class TopTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(TopTransformation.class); DPLParserCatalystContext catCtx = null; public TopStep topStep = null; - public TopTransformation(DPLParserCatalystContext catCtx) - { + public TopTransformation(DPLParserCatalystContext catCtx) { this.catCtx = catCtx; } public Node visitTopTransformation(DPLParser.TopTransformationContext ctx) { - LOGGER.info("TopTransformation incoming: children=<{}> text=<{}>",ctx.getChildCount(), ctx.getText()); + LOGGER.info("TopTransformation incoming: children=<{}> text=<{}>", ctx.getChildCount(), ctx.getText()); return topTransformationEmitCatalyst(ctx); } - public Node topTransformationEmitCatalyst(DPLParser.TopTransformationContext ctx) { int limit = 10; // Default limit @@ -98,12 +94,13 @@ public Node topTransformationEmitCatalyst(DPLParser.TopTransformationContext ctx LOGGER.info("param= <{}>", o.t_top_limitParameter().getChild(1).getText()); limit = Integer.parseInt(o.t_top_limitParameter().integerType().getText()); } - }; + } + ; // Get field list List fields = null; if (ctx.fieldListType() != null) { Node ret = visitFieldListType(ctx.fieldListType()); - fields = ((StringListNode)ret).asList(); + fields = ((StringListNode) ret).asList(); } // step @@ -113,12 +110,11 @@ public Node topTransformationEmitCatalyst(DPLParser.TopTransformationContext ctx return new StepNode(topStep); } - @Override public Node visitFieldListType(DPLParser.FieldListTypeContext ctx) { List fields = new ArrayList<>(); - ctx.children.forEach(f ->{ - String fieldType =visit(f).toString(); + ctx.children.forEach(f -> { + String fieldType = visit(f).toString(); fields.add(fieldType); }); return new StringListNode(fields); diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TransformStatement.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TransformStatement.java index 8dd0a5a..8138785 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TransformStatement.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/TransformStatement.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -59,20 +58,20 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; - /** - * Base statement for all transformation commands, for example - * statistics (stats) command, evaluation (eval) command and Teragrep system commands. 
+ * Base statement for all transformation commands, for example statistics (stats) command, evaluation (eval) command and + * Teragrep system commands. */ public class TransformStatement extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(TransformStatement.class); private final DPLParserCatalystContext catCtx; private final DPLParserCatalystVisitor catVisitor; /** - * Constructor for the TransformStatement. - * Initializes various classes for different transform commands. - * @param catCtx Catalyst context object containing objects like the Zeppelin config. + * Constructor for the TransformStatement. Initializes various classes for different transform commands. + * + * @param catCtx Catalyst context object containing objects like the Zeppelin config. * @param catVisitor Catalyst visitor object used for walking the parse tree. */ public TransformStatement(DPLParserCatalystContext catCtx, DPLParserCatalystVisitor catVisitor) { @@ -89,6 +88,7 @@ public Node visitTransformStatement(DPLParser.TransformStatementContext ctx) { /** * Goes through the transform statement, visiting the given transform commands in the statement. + * * @param ctx TransformStatement context * @return node generated during the walk */ @@ -106,13 +106,15 @@ private Node transformStatementEmitCatalyst(DPLParser.TransformStatementContext // Logging if (leftTree != null) { LOGGER.info("-> Left tree: text=<{}>", leftTree.getText()); - } else { + } + else { LOGGER.info("-> Left tree NULL"); } if (rightTree != null) { LOGGER.info("-> Right tree: text=<{}>", rightTree.getText()); - } else { + } + else { LOGGER.info("-> Right tree NULL"); } @@ -122,14 +124,18 @@ private Node transformStatementEmitCatalyst(DPLParser.TransformStatementContext if (left != null) { if (left instanceof StepNode) { LOGGER.debug("Add step to list"); - this.catVisitor.getStepList().add(((StepNode)left).get()); + this.catVisitor.getStepList().add(((StepNode) left).get()); } else if (left instanceof StepListNode) { LOGGER.debug("Add multiple steps to list"); - ((StepListNode)left).asList().forEach(step -> this.catVisitor.getStepList().add(step)); + ((StepListNode) left).asList().forEach(step -> this.catVisitor.getStepList().add(step)); } else { - LOGGER.error("visit of leftTree did not return Step(List)Node, instead got: class=<{}>", left.getClass().getName()); + LOGGER + .error( + "visit of leftTree did not return Step(List)Node, instead got: class=<{}>", + left.getClass().getName() + ); } // Add right branch if (rightTree != null) { @@ -137,15 +143,18 @@ else if (left instanceof StepListNode) { if (right != null) { LOGGER.debug("Right side was not null: <{}>", right); left = right; - } else { + } + else { LOGGER.debug("transformStatement "); } - } else { // EOF, return only left + } + else { // EOF, return only left LOGGER.debug("transformStatement return only left transformation"); } return left; - } else { + } + else { // If null is returned, the command is not implemented. // All implemented commands return a StepNode or a StepListNode. 
throw new IllegalArgumentException("The provided command '" + ctx.getText() + "' is not yet implemented."); @@ -206,6 +215,7 @@ public Node visitStrcatTransformation(DPLParser.StrcatTransformationContext ctx) // strcat command return new StrcatTransformation(catCtx.nullValue).visitStrcatTransformation(ctx); } + @Override public Node visitStatsTransformation(DPLParser.StatsTransformationContext ctx) { // stats command diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/WhereTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/WhereTransformation.java index 4bb00e8..634e255 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/WhereTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/WhereTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -61,14 +60,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.List; - /** * Class containing all the visitor methods for the where command
- * Can be piped like {@literal ... | where col > 1} to limit the results to only - * the values where the statement is true + * Can be piped like {@literal ... | where col > 1} to limit the results to only the values where the + * statement is true */ public class WhereTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(WhereTransformation.class); DPLParserCatalystContext catCtx = null; @@ -90,7 +88,7 @@ public WhereTransformation(DPLParserCatalystContext catCtx) { public Node visitWhereTransformation(DPLParser.WhereTransformationContext ctx) { this.whereStep = new WhereStep(); - ColumnNode cn = (ColumnNode)whereTransformationEmitCatalyst(ctx); + ColumnNode cn = (ColumnNode) whereTransformationEmitCatalyst(ctx); this.whereStep.setWhereColumn(cn.getColumn()); LOGGER.info("Set whereStep column to: <{}>", cn.getColumn().expr().sql()); @@ -111,20 +109,22 @@ private Node whereTransformationEmitCatalyst(DPLParser.WhereTransformationContex Node n = evalStatement.visit(ctx.getChild(1)); String sql = null; if (n instanceof ColumnNode) { - Column whereCol = ((ColumnNode)n).getColumn(); + Column whereCol = ((ColumnNode) n).getColumn(); // apply NOT if it was present if (isNot) { n = new ColumnNode(functions.not(whereCol)); } sql = whereCol.expr().sql(); LOGGER.info("WhereTransformation(Catalyst) out: children=<{}> sql=<{}>", ctx.getChildCount(), sql); - } else { + } + else { if (n != null) throw new RuntimeException( - "Where transformation operation not supported for type:" + n.getClass().getName()+" value="+n.toString()); + "Where transformation operation not supported for type:" + n.getClass().getName() + " value=" + + n.toString() + ); else - throw new RuntimeException( - "Where transformation operation not supported for type:" + n); + throw new RuntimeException("Where transformation operation not supported for type:" + n); } return n; } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/XmlkvTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/XmlkvTransformation.java index 205460e..6792eae 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/XmlkvTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/XmlkvTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -57,9 +57,11 @@ import org.slf4j.LoggerFactory; public class XmlkvTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(XmlkvTransformation.class); private final DPLParserCatalystContext catCtx; public XmlkvStep xmlkvStep; + public XmlkvTransformation(DPLParserCatalystContext catCtx) { this.catCtx = catCtx; } @@ -89,4 +91,4 @@ public Node visitXmlkvTransformation(DPLParser.XmlkvTransformationContext ctx) { return new StepNode(xmlkvStep); } -} \ No newline at end of file +} diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/accum/AccumTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/accum/AccumTransformation.java index 76af9f7..40aaa6c 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/accum/AccumTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/accum/AccumTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.accum; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -59,50 +58,52 @@ import org.slf4j.LoggerFactory; public class AccumTransformation extends DPLParserBaseVisitor { - private static final Logger LOGGER = LoggerFactory.getLogger(AccumTransformation.class); - public AccumStep accumStep; - public String newFieldName; - public String sourceField; - private final DPLParserCatalystContext catCtx; - public AccumTransformation(DPLParserCatalystContext catCtx) { - this.catCtx = catCtx; - this.newFieldName = ""; - this.sourceField = ""; - } - /* - * -- Command info: -- - * - * accum [AS ] - * - * Each event where is a number, the accum command calculates a running total or sum of the numbers. 
- * The results can be returned to the same field or a new field - * - * Example: - * Origin field | Result field - * 1 1 - * 2 3 - * 3 6 - * 4 10 - * 5 15 - * - */ - @Override - public Node visitAccumTransformation(DPLParser.AccumTransformationContext ctx) { - visitChildren(ctx); - this.accumStep = new AccumStep(catCtx.nullValue, sourceField, newFieldName); - return new StepNode(this.accumStep); - } + private static final Logger LOGGER = LoggerFactory.getLogger(AccumTransformation.class); + public AccumStep accumStep; + public String newFieldName; + public String sourceField; + private final DPLParserCatalystContext catCtx; + + public AccumTransformation(DPLParserCatalystContext catCtx) { + this.catCtx = catCtx; + this.newFieldName = ""; + this.sourceField = ""; + } + + /* + * -- Command info: -- + * + * accum [AS ] + * + * Each event where is a number, the accum command calculates a running total or sum of the numbers. + * The results can be returned to the same field or a new field + * + * Example: + * Origin field | Result field + * 1 1 + * 2 3 + * 3 6 + * 4 10 + * 5 15 + * + */ + @Override + public Node visitAccumTransformation(DPLParser.AccumTransformationContext ctx) { + visitChildren(ctx); + this.accumStep = new AccumStep(catCtx.nullValue, sourceField, newFieldName); + return new StepNode(this.accumStep); + } - @Override - public Node visitT_accum_fieldRenameInstruction(DPLParser.T_accum_fieldRenameInstructionContext ctx) { - this.newFieldName = new UnquotedText(new TextString(ctx.fieldType().getText())).read(); - return new NullNode(); - } + @Override + public Node visitT_accum_fieldRenameInstruction(DPLParser.T_accum_fieldRenameInstructionContext ctx) { + this.newFieldName = new UnquotedText(new TextString(ctx.fieldType().getText())).read(); + return new NullNode(); + } - @Override - public Node visitFieldType(DPLParser.FieldTypeContext ctx) { - this.sourceField = new UnquotedText(new TextString(ctx.getText())).read(); - return new NullNode(); - } + @Override + public Node visitFieldType(DPLParser.FieldTypeContext ctx) { + this.sourceField = new UnquotedText(new TextString(ctx.getText())).read(); + return new NullNode(); + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/accum/AccumulatedSum.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/accum/AccumulatedSum.java index 90da004..5752140 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/accum/AccumulatedSum.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/accum/AccumulatedSum.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.transformstatement.accum; import java.io.Serializable; @@ -51,29 +50,30 @@ import java.util.List; /*** - * A class that provides functionality to collect numeric values of type Long and - * calculate the sum from the current pool of numbers. + * A class that provides functionality to collect numeric values of type Long and calculate the sum from the current + * pool of numbers. + * * @author eemhu - * */ public class AccumulatedSum implements Serializable { - private static final long serialVersionUID = 1L; - private List listOfValues; - - public AccumulatedSum() { - this.listOfValues = new ArrayList(); - } - - public void addNumber(Long n) { - this.listOfValues.add(n); - } - - public Long calculateSum() { - Long sum = 0L; - for (Long val : this.listOfValues) { - sum += val; - } - //LOGGER.info(String.format("AccumulatedSum returned with the value of %s", sum)); - return sum; - } + + private static final long serialVersionUID = 1L; + private List listOfValues; + + public AccumulatedSum() { + this.listOfValues = new ArrayList(); + } + + public void addNumber(Long n) { + this.listOfValues.add(n); + } + + public Long calculateSum() { + Long sum = 0L; + for (Long val : this.listOfValues) { + sum += val; + } + //LOGGER.info(String.format("AccumulatedSum returned with the value of %s", sum)); + return sum; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/accum/BatchCollector.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/accum/BatchCollector.java index 9b85e0c..8d07fce 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/accum/BatchCollector.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/accum/BatchCollector.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.transformstatement.accum; import org.apache.spark.sql.Dataset; @@ -60,84 +59,79 @@ import java.util.concurrent.ConcurrentSkipListMap; public class BatchCollector { - private static final Logger LOGGER = LoggerFactory.getLogger(BatchCollector.class); - - private final ConcurrentSkipListMap> mapOfCollected; - private int rowsCollected; - private int numberOfRowsToCollect; - - private StructType batchSchema = null; - private boolean ordered = true; - - public BatchCollector() { - this.mapOfCollected = new ConcurrentSkipListMap<>(); - this.rowsCollected = 0; - //this.numberOfRowsToCollect = numberOfRowsToCollect; - this.ordered = true; - } - - public BatchCollector(boolean ordered) { - this.mapOfCollected = new ConcurrentSkipListMap<>(); - this.rowsCollected = 0; - //this.numberOfRowsToCollect = numberOfRowsToCollect; - this.ordered = ordered; - } - - public void collect(Dataset batchDF, Long batchId) { - List rowsFromBatch = null; - this.batchSchema = batchDF.schema(); - - if (ordered) { - rowsFromBatch = batchDF - .orderBy(functions.col("_time").desc()) - .repartition(1) - .collectAsList(); - } - else { - rowsFromBatch = batchDF - .repartition(1) - .collectAsList(); - } - - - rowsFromBatch.forEach(row -> { - insert(row); - }); - } - - public StructType getBatchSchema() { - return this.batchSchema; - } - - public void insert(Row rowToInsert) { - Timestamp timestampOfRow = rowToInsert.getTimestamp(0); - rowsCollected++; - - // Key exists in map, add the rowToInsert to the List of rows for the timestamp - if (mapOfCollected.get(timestampOfRow) != null) { - mapOfCollected.get(timestampOfRow).add(rowToInsert); - } - // Key does not exist in map, add new Key (Timestamp), Value (List) pair to map - else { - LinkedList listOfRows = new LinkedList(); - listOfRows.add(rowToInsert); - mapOfCollected.put(timestampOfRow, listOfRows); - } - } - - public void printCollected() { - mapOfCollected.values().forEach(value -> { - LOGGER.info(value.toString()); - }); - } - - public ArrayList toList() { - ArrayList rv = new ArrayList<>(); - - mapOfCollected.values().forEach(rowList -> { - rv.addAll(rowList); - }); - - return rv; - } + + private static final Logger LOGGER = LoggerFactory.getLogger(BatchCollector.class); + + private final ConcurrentSkipListMap> mapOfCollected; + private int rowsCollected; + private int numberOfRowsToCollect; + + private StructType batchSchema = null; + private boolean ordered = true; + + public BatchCollector() { + this.mapOfCollected = new ConcurrentSkipListMap<>(); + this.rowsCollected = 0; + //this.numberOfRowsToCollect = numberOfRowsToCollect; + this.ordered = true; + } + + public BatchCollector(boolean ordered) { + this.mapOfCollected = new ConcurrentSkipListMap<>(); + this.rowsCollected = 0; + //this.numberOfRowsToCollect = numberOfRowsToCollect; + this.ordered = ordered; + } + + public void collect(Dataset batchDF, Long batchId) { + List rowsFromBatch = null; + this.batchSchema = batchDF.schema(); + + if (ordered) { + rowsFromBatch = batchDF.orderBy(functions.col("_time").desc()).repartition(1).collectAsList(); + } + else { + rowsFromBatch = batchDF.repartition(1).collectAsList(); + } + + rowsFromBatch.forEach(row -> { + insert(row); + }); + } + + public StructType getBatchSchema() { + return this.batchSchema; + } + + public void insert(Row rowToInsert) { + Timestamp timestampOfRow = rowToInsert.getTimestamp(0); + rowsCollected++; + + // Key exists in map, add the rowToInsert to the List of rows for the timestamp + if 
(mapOfCollected.get(timestampOfRow) != null) { + mapOfCollected.get(timestampOfRow).add(rowToInsert); + } + // Key does not exist in map, add new Key (Timestamp), Value (List) pair to map + else { + LinkedList listOfRows = new LinkedList(); + listOfRows.add(rowToInsert); + mapOfCollected.put(timestampOfRow, listOfRows); + } + } + + public void printCollected() { + mapOfCollected.values().forEach(value -> { + LOGGER.info(value.toString()); + }); + } + + public ArrayList toList() { + ArrayList rv = new ArrayList<>(); + + mapOfCollected.values().forEach(rowList -> { + rv.addAll(rowList); + }); + + return rv; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/addtotals/AddtotalsUDF.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/addtotals/AddtotalsUDF.java index 0cac9c7..f9e8f35 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/addtotals/AddtotalsUDF.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/addtotals/AddtotalsUDF.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -51,6 +51,7 @@ // source-field, public class AddtotalsUDF implements UDF1 { + @Override public String call(Object o) throws Exception { final TypeParser tp = new TypeParser(); diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Auto.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Auto.java index 6da578b..7b91089 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Auto.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Auto.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,16 +43,17 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.ast.commands.transformstatement.convert; import org.apache.spark.sql.api.java.UDF1; /** * UDF used for convert command 'auto()' + * * @author eemhu */ public class Auto implements UDF1 { + /** * @param s Input string to be converted to a number using best conversion * @return Input string as a number diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Ctime.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Ctime.java index a74b503..5e4ce80 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Ctime.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Ctime.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.convert; import com.teragrep.pth10.ast.DPLTimeFormat; @@ -56,23 +55,23 @@ /** * UDF for convert command 'ctime'
* Converts epoch time into given timeformat
+ * * @author eemhu - * */ -public class Ctime implements UDF2{ +public class Ctime implements UDF2 { + + private static final long serialVersionUID = 1L; - private static final long serialVersionUID = 1L; + @Override + public String call(String epoch, String tf) throws Exception { + Long e = Long.valueOf(epoch); - @Override - public String call(String epoch, String tf) throws Exception { - Long e = Long.valueOf(epoch); - - Date date = new Date(e * 1000L); - DateFormat format = new DPLTimeFormat(tf).createSimpleDateFormat(); - format.setTimeZone(TimeZone.getTimeZone("Etc/UTC")); - String formatted = format.format(date); + Date date = new Date(e * 1000L); + DateFormat format = new DPLTimeFormat(tf).createSimpleDateFormat(); + format.setTimeZone(TimeZone.getTimeZone("Etc/UTC")); + String formatted = format.format(date); - return formatted; - } + return formatted; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Dur2Sec.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Dur2Sec.java index c47ee9c..8b94616 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Dur2Sec.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Dur2Sec.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,41 +43,42 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.convert; import org.apache.spark.sql.api.java.UDF1; /** * Converts duration to seconds + * * @author eemhu - * */ public class Dur2Sec implements UDF1 { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; - @Override - public String call(String duration) throws Exception { - // duration is in format [D+]HH:MM:SS - // split based on colon (':') - String[] parts = duration.split(":"); + @Override + public String call(String duration) throws Exception { + // duration is in format [D+]HH:MM:SS + // split based on colon (':') + String[] parts = duration.split(":"); - long hrs = 0L, min = 0L, sec = 0L; - if (parts.length == 3) { - hrs = Long.valueOf(parts[0]); - min = Long.valueOf(parts[1]); - sec = Long.valueOf(parts[2]); - } - else { - throw new RuntimeException("Duration value '" + duration + "' is not of valid format. Expected: [D+]HH:MM:SS"); - } + long hrs = 0L, min = 0L, sec = 0L; + if (parts.length == 3) { + hrs = Long.valueOf(parts[0]); + min = Long.valueOf(parts[1]); + sec = Long.valueOf(parts[2]); + } + else { + throw new RuntimeException( + "Duration value '" + duration + "' is not of valid format. 
Expected: [D+]HH:MM:SS" + ); + } - // add minutes and hours to seconds - sec += (min * 60L); - sec += (hrs * 60L * 60L); + // add minutes and hours to seconds + sec += (min * 60L); + sec += (hrs * 60L * 60L); - return String.valueOf(sec); - } + return String.valueOf(sec); + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Memk.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Memk.java index f55d355..5e7f614 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Memk.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Memk.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.convert; import org.apache.spark.sql.api.java.UDF1; @@ -54,39 +53,42 @@ /** * Positive int or float to kilobytes.
* Default is kb to kb, can be k, m or g + * * @author eemhu - * */ public class Memk implements UDF1 { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; + + @Override + public String call(String input) throws Exception { + // (\d+(.\d+)?(k|m|g)?) + Matcher m = Pattern.compile("\\d+(.\\d+)?").matcher(input); + + if (!m.find()) { + throw new RuntimeException( + "Invalid value given for function memk(). Expected: Positive integer or float, with an optional unit k, m or g. Instead got: " + + input + ); + } + + String numberPart = m.group(); + float number = Float.parseFloat(numberPart); + String unit = input.substring(numberPart.length()); - @Override - public String call(String input) throws Exception { - // (\d+(.\d+)?(k|m|g)?) - Matcher m = Pattern.compile("\\d+(.\\d+)?").matcher(input); - - if (!m.find()) { - throw new RuntimeException("Invalid value given for function memk(). Expected: Positive integer or float, with an optional unit k, m or g. Instead got: " + input); - } - - String numberPart = m.group(); - float number = Float.parseFloat(numberPart); - String unit = input.substring(numberPart.length()); - - if (unit.equalsIgnoreCase("k")) { - return String.valueOf(number); - } - else if (unit.equalsIgnoreCase("m")) { - return String.valueOf(number * 1024f); - } - else if (unit.equalsIgnoreCase("g")) { - return String.valueOf(number * 1024f * 1024f); - } - else { - // invalid unit, default to "k" - return String.valueOf(number); - } - } + if (unit.equalsIgnoreCase("k")) { + return String.valueOf(number); + } + else if (unit.equalsIgnoreCase("m")) { + return String.valueOf(number * 1024f); + } + else if (unit.equalsIgnoreCase("g")) { + return String.valueOf(number * 1024f * 1024f); + } + else { + // invalid unit, default to "k" + return String.valueOf(number); + } + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Mktime.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Mktime.java index ce9d02c..f18cadb 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Mktime.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Mktime.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.convert; import com.teragrep.pth10.ast.DPLTimeFormat; @@ -52,19 +51,19 @@ /** * UDF for convert command 'mktime'
* Human readable time to epoch using given timeformat
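For illustration, the mktime conversion described here takes a human-readable timestamp plus a DPL time format and returns the epoch as a string; a minimal sketch, assuming DPLTimeFormat accepts strptime-style specifiers such as %Y and %m (the timestamp and format below are illustrative, not taken from the patch):

    import com.teragrep.pth10.ast.commands.transformstatement.convert.Mktime;

    public class MktimeSketch {

        public static void main(String[] args) throws Exception {
            Mktime mktime = new Mktime();
            // Assumed format string; DPLTimeFormat parses the timestamp with it
            // and getEpoch() returns the epoch as a Long, here printed as a string.
            String epoch = mktime.call("2023-01-15 12:00:00", "%Y-%m-%d %H:%M:%S");
            System.out.println(epoch);
        }
    }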
+ * * @author eemhu - * */ public class Mktime implements UDF2 { - - private static final long serialVersionUID = 1L; - @Override - public String call(String hrt, String tf) throws Exception { - DPLTimeFormat format = new DPLTimeFormat(tf); - Long rv = format.getEpoch(hrt); + private static final long serialVersionUID = 1L; + + @Override + public String call(String hrt, String tf) throws Exception { + DPLTimeFormat format = new DPLTimeFormat(tf); + Long rv = format.getEpoch(hrt); - return rv.toString(); - } + return rv.toString(); + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Mstime.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Mstime.java index 718cda8..234adf1 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Mstime.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Mstime.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.convert; import org.apache.spark.sql.api.java.UDF1; @@ -51,37 +50,38 @@ /** * UDF for convert command 'mstime'
 * Human readable time ([MM:]SS.SSS) to milliseconds
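Concretely, minutes (when present) and seconds are folded into the millisecond count, so the conversion can be checked by hand; a small worked example using the UDF as defined below (the input value is illustrative):

    import com.teragrep.pth10.ast.commands.transformstatement.convert.Mstime;

    public class MstimeSketch {

        public static void main(String[] args) throws Exception {
            Mstime mstime = new Mstime();
            // "02:30.500" -> 2 * 60_000 ms + 30 * 1_000 ms + 500 ms = 150500
            System.out.println(mstime.call("02:30.500")); // prints 150500
        }
    }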
+ * * @author eemhu - * */ public class Mstime implements UDF1 { - - private static final long serialVersionUID = 1L; - @Override - public String call(String duration) throws Exception { - // duration is in format [MM:]SS.SSS - // split based on colon (':') and period ('.') - - String[] parts = duration.split("\\."); // MM:SS and SSS parts - String[] minutesAndSeconds = parts[0].split(":"); // separate MM and SS - - long min = 0; long sec = 0; - if (minutesAndSeconds.length > 1) { // if minutes present - min = Long.valueOf(minutesAndSeconds[0]); - sec = Long.valueOf(minutesAndSeconds[1]); - } - else { // no minutes, just sec and millisec - sec = Long.valueOf(minutesAndSeconds[0]); - } + private static final long serialVersionUID = 1L; + + @Override + public String call(String duration) throws Exception { + // duration is in format [MM:]SS.SSS + // split based on colon (':') and period ('.') + + String[] parts = duration.split("\\."); // MM:SS and SSS parts + String[] minutesAndSeconds = parts[0].split(":"); // separate MM and SS + + long min = 0; + long sec = 0; + if (minutesAndSeconds.length > 1) { // if minutes present + min = Long.valueOf(minutesAndSeconds[0]); + sec = Long.valueOf(minutesAndSeconds[1]); + } + else { // no minutes, just sec and millisec + sec = Long.valueOf(minutesAndSeconds[0]); + } + + long ms = Long.valueOf(parts[1]); + + // add everything up to milliseconds + ms += min * 60L * 1000L; + ms += sec * 1000L; - long ms = Long.valueOf(parts[1]); - - // add everything up to milliseconds - ms += min * 60L * 1000L; - ms += sec * 1000L; - - return String.valueOf(ms); - } + return String.valueOf(ms); + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Rmunit.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Rmunit.java index d69cefd..ea45f92 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Rmunit.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/convert/Rmunit.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.convert; import org.apache.spark.sql.api.java.UDF1; @@ -61,188 +60,189 @@ * Exponential numbers are supported, and numbers beginning with +/- signs and dots (.)
* Also numbers beginning with 0 are supported.
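In practice rmunit keeps the leading numeric part of the value and drops whatever trails it, returning an empty string when no number can be recognised; a behavioural sketch based on the implementation below (the inputs are illustrative):

    import com.teragrep.pth10.ast.commands.transformstatement.convert.Rmunit;

    public class RmunitSketch {

        public static void main(String[] args) throws Exception {
            Rmunit rmunit = new Rmunit();
            System.out.println(rmunit.call("100kg"));   // "100"   - trailing unit removed
            System.out.println(rmunit.call("2.5e3ms")); // "2.5E3" - numeric part kept, unit dropped
            System.out.println(rmunit.call("abc"));     // ""      - no number found
        }
    }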
* Only one decimal place is supported. (e.g. 100.000.000 is not a valid number). + * * @author eemhu - * */ public class Rmunit implements UDF1 { - - private static final long serialVersionUID = 1L; - private static final Logger LOGGER = LoggerFactory.getLogger(Rmunit.class); - private static final Pattern p = Pattern.compile("^(\\+|-)?[0-9]*((\\.)[0-9]*)?((e|E)[0-9]+)?"); - - @Override - public String call(String value) throws Exception { - String rv; - final StringBuilder sb = new StringBuilder(); - - // special cases - boolean firstWasZero = false; - boolean secondWasZero = false; - - // match against pattern (+|-)? [0-9]* ((.)? [0-9]*)* (e[0-9]+)? - Matcher m = p.matcher(value); - if (!m.find()) { - return ""; // on fail, return empty - } - - // Go through input, removing trailing characters if needed - for (int i = 0; i < value.length(); i++) { - char c = value.charAt(i); - if (i == 0) { - // first one can be ./+/-/digit - if (c == '0') { - firstWasZero = true; - sb.append(c); - continue; - } - else if (c == '+' || c == '-' || c == '.' || Character.isDigit(c)) { - sb.append(c); - continue; - } - else { - return ""; - } - } - else if (i == 1) { - //Second one can be ./digit - if (c == '.') { - if (firstWasZero) { - // 0. -> . - sb.replace(0, 1, "."); - } else { - sb.append("."); - } - continue; - } - else if (c == '0') { - secondWasZero = true; - sb.append(c); - continue; - } - else if (Character.isDigit(c)) { - // this is ok - sb.append(c); - continue; - } - else { - return ""; - } - } - else if (i==2) { - if (c == '.' && secondWasZero && !firstWasZero) { - // (+/-)0. -> (+/-). - sb.replace(1, 2, "."); - continue; - } - } - else if (!Character.isDigit(c) && c != '+' && c != '-' && c != '.' && c != 'e' && c != 'E') { - // break off on non-digit (excluding +/-/./e/E) - break; - } - - sb.append(c); - } - - // check final char (special case: '0.000e'; trailing exponent) - String cleanedUpString = sb.toString(); - char finalChar = cleanedUpString.charAt(cleanedUpString.length()-1); - if (finalChar == 'e' || finalChar == 'E') { - cleanedUpString = cleanedUpString.substring(0, cleanedUpString.length()-1); - } - - // build pattern used by DecimalFormat - StringBuilder pattern = new StringBuilder(); - boolean previousCharWasExponent = false; - char exponentSign = ' '; - for (int i = 0; i < cleanedUpString.length(); i++) { - char c = cleanedUpString.charAt(i); - if (i == 0) { - if (c == '+' || c == '.') { // '-' can't be added, as DecimalFormat automatically adds it - pattern.append(c); - } - else if (Character.isDigit(c)) { - pattern.append('0'); - } - } - else if (i == 1) { - if (c == '.') { - pattern.append(c); - } - else if (Character.isDigit(c)){ - pattern.append('0'); - } - } - else { - if (Character.isDigit(c)) { - pattern.append('0'); - } - else if (c == 'e' || c == 'E') { // DecimalFormat expects capitalized E - pattern.append('E'); - previousCharWasExponent=true; - } - else if (previousCharWasExponent) { - // don't add +/- after E in pattern since DecimalFormat does not expect them - if (c != '+' && c != '-') { - sb.append(c); - } - else { - exponentSign=c; - } - previousCharWasExponent=false; - } - else { - pattern.append(c); - } - } - } - - LOGGER.debug("rmunit-Convert: <{}>", cleanedUpString); - LOGGER.debug("rmunit-Pattern: <{}>", pattern); - try { - rv = new DecimalFormat(pattern.toString()).format(new BigDecimal(cleanedUpString)); - - // Even though the pattern is "0.00E0", DecimalFormat causes "0.45E2" to be converted into ".45E2" - // which is incorrect. 
These checks will avoid that. - if (firstWasZero) { - if (rv.charAt(0) != '0') { - rv = '0' + rv; - } - } - - if (secondWasZero) { - if (rv.charAt(1) != '0') { - String leftSide = rv.substring(0, 1); - String rightSide = rv.substring(1); - rv = leftSide + '0' + rightSide; - } - } - - // DecimalFormat ignores E+ like this: - // E+00 -> E00 so add + back - // E-00 -> E-00, no need to add - if (exponentSign == '+') { - int smallE = rv.indexOf("e"); - int bigE = rv.indexOf("E"); - - if (smallE != -1) { - String leftSide = rv.substring(0, smallE+1); - String rightSide = rv.substring(smallE+1); - rv = leftSide + exponentSign + rightSide; - - } - else if (bigE != -1) { - String leftSide = rv.substring(0, bigE+1); - String rightSide = rv.substring(bigE+1); - rv = leftSide + exponentSign + rightSide; - } - } - } - catch (Exception e) { - // return empty on fail - LOGGER.error(e.getMessage()); - rv = ""; - } - - return rv; - } + + private static final long serialVersionUID = 1L; + private static final Logger LOGGER = LoggerFactory.getLogger(Rmunit.class); + private static final Pattern p = Pattern.compile("^(\\+|-)?[0-9]*((\\.)[0-9]*)?((e|E)[0-9]+)?"); + + @Override + public String call(String value) throws Exception { + String rv; + final StringBuilder sb = new StringBuilder(); + + // special cases + boolean firstWasZero = false; + boolean secondWasZero = false; + + // match against pattern (+|-)? [0-9]* ((.)? [0-9]*)* (e[0-9]+)? + Matcher m = p.matcher(value); + if (!m.find()) { + return ""; // on fail, return empty + } + + // Go through input, removing trailing characters if needed + for (int i = 0; i < value.length(); i++) { + char c = value.charAt(i); + if (i == 0) { + // first one can be ./+/-/digit + if (c == '0') { + firstWasZero = true; + sb.append(c); + continue; + } + else if (c == '+' || c == '-' || c == '.' || Character.isDigit(c)) { + sb.append(c); + continue; + } + else { + return ""; + } + } + else if (i == 1) { + //Second one can be ./digit + if (c == '.') { + if (firstWasZero) { + // 0. -> . + sb.replace(0, 1, "."); + } + else { + sb.append("."); + } + continue; + } + else if (c == '0') { + secondWasZero = true; + sb.append(c); + continue; + } + else if (Character.isDigit(c)) { + // this is ok + sb.append(c); + continue; + } + else { + return ""; + } + } + else if (i == 2) { + if (c == '.' && secondWasZero && !firstWasZero) { + // (+/-)0. -> (+/-). + sb.replace(1, 2, "."); + continue; + } + } + else if (!Character.isDigit(c) && c != '+' && c != '-' && c != '.' 
&& c != 'e' && c != 'E') { + // break off on non-digit (excluding +/-/./e/E) + break; + } + + sb.append(c); + } + + // check final char (special case: '0.000e'; trailing exponent) + String cleanedUpString = sb.toString(); + char finalChar = cleanedUpString.charAt(cleanedUpString.length() - 1); + if (finalChar == 'e' || finalChar == 'E') { + cleanedUpString = cleanedUpString.substring(0, cleanedUpString.length() - 1); + } + + // build pattern used by DecimalFormat + StringBuilder pattern = new StringBuilder(); + boolean previousCharWasExponent = false; + char exponentSign = ' '; + for (int i = 0; i < cleanedUpString.length(); i++) { + char c = cleanedUpString.charAt(i); + if (i == 0) { + if (c == '+' || c == '.') { // '-' can't be added, as DecimalFormat automatically adds it + pattern.append(c); + } + else if (Character.isDigit(c)) { + pattern.append('0'); + } + } + else if (i == 1) { + if (c == '.') { + pattern.append(c); + } + else if (Character.isDigit(c)) { + pattern.append('0'); + } + } + else { + if (Character.isDigit(c)) { + pattern.append('0'); + } + else if (c == 'e' || c == 'E') { // DecimalFormat expects capitalized E + pattern.append('E'); + previousCharWasExponent = true; + } + else if (previousCharWasExponent) { + // don't add +/- after E in pattern since DecimalFormat does not expect them + if (c != '+' && c != '-') { + sb.append(c); + } + else { + exponentSign = c; + } + previousCharWasExponent = false; + } + else { + pattern.append(c); + } + } + } + + LOGGER.debug("rmunit-Convert: <{}>", cleanedUpString); + LOGGER.debug("rmunit-Pattern: <{}>", pattern); + try { + rv = new DecimalFormat(pattern.toString()).format(new BigDecimal(cleanedUpString)); + + // Even though the pattern is "0.00E0", DecimalFormat causes "0.45E2" to be converted into ".45E2" + // which is incorrect. These checks will avoid that. 
+ if (firstWasZero) { + if (rv.charAt(0) != '0') { + rv = '0' + rv; + } + } + + if (secondWasZero) { + if (rv.charAt(1) != '0') { + String leftSide = rv.substring(0, 1); + String rightSide = rv.substring(1); + rv = leftSide + '0' + rightSide; + } + } + + // DecimalFormat ignores E+ like this: + // E+00 -> E00 so add + back + // E-00 -> E-00, no need to add + if (exponentSign == '+') { + int smallE = rv.indexOf("e"); + int bigE = rv.indexOf("E"); + + if (smallE != -1) { + String leftSide = rv.substring(0, smallE + 1); + String rightSide = rv.substring(smallE + 1); + rv = leftSide + exponentSign + rightSide; + + } + else if (bigE != -1) { + String leftSide = rv.substring(0, bigE + 1); + String rightSide = rv.substring(bigE + 1); + rv = leftSide + exponentSign + rightSide; + } + } + } + catch (Exception e) { + // return empty on fail + LOGGER.error(e.getMessage()); + rv = ""; + } + + return rv; + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/iplocation/IplocationGeoIPDataMapper.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/iplocation/IplocationGeoIPDataMapper.java index bcd430a..be3d961 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/iplocation/IplocationGeoIPDataMapper.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/iplocation/IplocationGeoIPDataMapper.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.iplocation; import com.maxmind.geoip2.DatabaseReader; @@ -72,6 +71,7 @@ * Requires either a GeoIP2-City or GeoIP2-Country (or GeoLite) MaxMind database to function. */ public class IplocationGeoIPDataMapper implements UDF3> { + private static final Logger LOGGER = LoggerFactory.getLogger(IplocationGeoIPDataMapper.class); private DatabaseReader reader; private final String path; @@ -100,7 +100,8 @@ public Map call(String ipString, String lang, Boolean allFields) InetAddress inetAddress; try { inetAddress = InetAddress.getByName(ipString); - } catch (UnknownHostException uhe) { + } + catch (UnknownHostException uhe) { LOGGER.warn("Unknown host exception: <{}>. Returning null result.", uhe); result = new HashMap<>(); result.put("lat", nullValue.value()); @@ -151,6 +152,7 @@ else if (dbType.equals("GeoLite2-Country") || dbType.equals("GeoIP2-Country")) { /** * Gets the location information for a CountryResponse + * * @param resp CountryResponse * @param lang Language, for example 'en', 'ja' or 'zh-CN'. 
* @return Map containing location data @@ -190,6 +192,7 @@ else if (resp.getCountry().getName() != null) { /** * Gets the location information for a CityResponse + * * @param resp CityResponse * @param lang Language, for example 'en', 'ja' or 'zh-CN' * @return Map of the location information @@ -283,7 +286,8 @@ else if (continent.getName() != null) { /** * Reads a file from HDFS and prepares an InputStream of it - * @param path HDFS (or local) file path + * + * @param path HDFS (or local) file path * @param hadoopConf Hadoop configuration item required for HDFS reading * @return Java IO InputStream of the file */ @@ -301,7 +305,8 @@ private InputStream initInputStream(String path, Configuration hadoopConf) { else { throw new RuntimeException("Invalid database file path given for iplocation command."); } - } catch (IOException e) { + } + catch (IOException e) { throw new RuntimeException(e); } @@ -310,6 +315,7 @@ private InputStream initInputStream(String path, Configuration hadoopConf) { /** * Assembles the Hadoop configuration object based on the key-value mapping of internal hadoop config map
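The Hadoop configuration assembly mentioned here is, in essence, a key-by-key copy of the internal map into an org.apache.hadoop.conf.Configuration; a minimal sketch of that idea, not the project's exact implementation:

    import org.apache.hadoop.conf.Configuration;

    import java.util.Map;

    public final class HadoopConfSketch {

        // Copies each key-value pair of a plain map into a Hadoop Configuration.
        public static Configuration toHadoopConfiguration(Map<String, String> hadoopCfgMap) {
            Configuration configuration = new Configuration();
            for (Map.Entry<String, String> entry : hadoopCfgMap.entrySet()) {
                configuration.set(entry.getKey(), entry.getValue());
            }
            return configuration;
        }
    }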
+ * * @param hadoopCfgMap Map containing key-value pairs of hadoop configuration * @return Hadoop configuration object */ diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/iplocation/IplocationRirDataMapper.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/iplocation/IplocationRirDataMapper.java index fba6859..2a2311b 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/iplocation/IplocationRirDataMapper.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/iplocation/IplocationRirDataMapper.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.iplocation; import com.maxmind.db.Reader; @@ -65,11 +64,11 @@ /** * Maps each IP address string to location information.
- * Requires a rir-data MaxMind database to function. - * Expects the rir-data schema to contain "Country" and "Operator", however - * will skip any null values and return empty strings if encountered. + * Requires a rir-data MaxMind database to function. Expects the rir-data schema to contain "Country" and "Operator", + * however will skip any null values and return empty strings if encountered. */ public class IplocationRirDataMapper implements UDF3> { + private static final Logger LOGGER = LoggerFactory.getLogger(IplocationRirDataMapper.class); private boolean initialized = false; private final String path; @@ -97,8 +96,9 @@ public Map call(String ipString, String lang, Boolean allFields) InetAddress inetAddress; try { - inetAddress = InetAddress.getByName(ipString); - } catch (UnknownHostException uhe) { + inetAddress = InetAddress.getByName(ipString); + } + catch (UnknownHostException uhe) { LOGGER.warn("Unknown host exception: <{}>. Returning null result.", uhe); result.put("country", nullValue.value()); result.put("operator", nullValue.value()); @@ -132,7 +132,8 @@ public Map call(String ipString, String lang, Boolean allFields) /** * Reads a file from HDFS and prepares an InputStream of it - * @param path HDFS (or local) file path + * + * @param path HDFS (or local) file path * @param hadoopConf Hadoop configuration item required for HDFS reading * @return Java IO InputStream of the file */ @@ -150,7 +151,8 @@ private InputStream initInputStream(String path, Configuration hadoopConf) { else { throw new RuntimeException("Invalid database file path given for iplocation command."); } - } catch (IOException e) { + } + catch (IOException e) { throw new RuntimeException(e); } @@ -159,6 +161,7 @@ private InputStream initInputStream(String path, Configuration hadoopConf) { /** * Assembles the Hadoop configuration object based on the key-value mapping of internal hadoop config map
+ * * @param hadoopCfgMap Map containing key-value pairs of hadoop configuration * @return Hadoop configuration object */ diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/iplocation/RirLookupResult.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/iplocation/RirLookupResult.java index 87f316e..15da765 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/iplocation/RirLookupResult.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/iplocation/RirLookupResult.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,21 +43,25 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.iplocation; import com.maxmind.db.MaxMindDbConstructor; import com.maxmind.db.MaxMindDbParameter; /** - * Used by the MaxMind database reader for looking up results and providing a way - * for the Java code to access the results for given IP address + * Used by the MaxMind database reader for looking up results and providing a way for the Java code to access the + * results for given IP address */ public class RirLookupResult { + private final String country; private final String operator; + @MaxMindDbConstructor - public RirLookupResult(@MaxMindDbParameter(name="Country") String country, @MaxMindDbParameter(name="Operator") String operator) { + public RirLookupResult( + @MaxMindDbParameter(name = "Country") String country, + @MaxMindDbParameter(name = "Operator") String operator + ) { this.country = country; this.operator = operator; } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/regex/RegexMatch.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/regex/RegexMatch.java index bd7aff6..381b2c6 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/regex/RegexMatch.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/regex/RegexMatch.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.regex; import com.teragrep.jpr_01.JavaPcre; @@ -58,9 +57,10 @@ public class RegexMatch implements UDF3 { /** * Filter rows that do not match the regex - * @param rowString row content + * + * @param rowString row content * @param regexString regex statement - * @param equals = is true, != is false + * @param equals = is true, != is false * @return boolean for where function * @throws Exception invalid args or pcre error */ @@ -93,6 +93,12 @@ else if (!isMatch && !equals) { return true; } - throw new RuntimeException(String.format("Invalid arguments used with regex command: row: '%s' regex: '%s' equals: '%s'", rowString, regexString, equals)); + throw new RuntimeException( + String + .format( + "Invalid arguments used with regex command: row: '%s' regex: '%s' equals: '%s'", + rowString, regexString, equals + ) + ); } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/replace/ReplaceCmd.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/replace/ReplaceCmd.java index ac5ef7b..634d764 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/replace/ReplaceCmd.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/replace/ReplaceCmd.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.replace; import org.apache.spark.sql.api.java.UDF3; @@ -56,10 +55,9 @@ /** * UDF used for the command replace.
- * Spark's built-in regex_replace can be used, but the wildcard replacement - * does not work using that function. Instead it must be done using this UDF. - * More information can be found from the 'struck' documentation. - *

+ * Spark's built-in regex_replace can be used, but the wildcard replacement does not work using that function. Instead + * it must be done using this UDF. More information can be found from the 'struck' documentation.
+ *
* Command examples:
* Original data: "data" *
@@ -67,11 +65,12 @@ *
* (2) REPLACE "dat*" WITH "y" -> result: "y" *
- * (3) REPLACE "dat*" WITH "y*" -> result: "ya" + * (3) REPLACE "dat*" WITH "y*" -> result: "ya" *
- * (4) REPLACE "*at*" WITH "*y*" -> result: "dya" + * (4) REPLACE "*at*" WITH "*y*" -> result: "dya" */ public class ReplaceCmd implements UDF3 { + private static final Logger LOGGER = LoggerFactory.getLogger(ReplaceCmd.class); @Override @@ -82,8 +81,14 @@ public String call(String currentContent, String wildcard, String replaceWith) t // Match converted regex with current content of the row Matcher matcher = Pattern.compile(regex).matcher(currentContent); - LOGGER.debug(String.format("[ReplaceCmd.call] Wildcard: %s | Regex: %s | CurrentContent: %s | WithClause: %s", - wildcard, regex, currentContent, replaceWith)); + LOGGER + .debug( + String + .format( + "[ReplaceCmd.call] Wildcard: %s | Regex: %s | CurrentContent: %s | WithClause: %s", + wildcard, regex, currentContent, replaceWith + ) + ); // Is there a match for replacing? boolean isMatch = matcher.matches(); @@ -101,41 +106,43 @@ public String call(String currentContent, String wildcard, String replaceWith) t } for (int i = 0; i < partsOfWildcard.length; i++) { - String contentWithinWildcard; - if (isFirst && partsOfWildcard.length > 1) { - if (partsOfWildcard[i].length() < 1) { - // Leading wildcard, without trailing present - contentWithinWildcard = currentContent.substring(0, currentContent.indexOf(partsOfWildcard[i+1])); - } - else { - // Leading wildcard, with trailing present - contentWithinWildcard = currentContent.substring(0, currentContent.indexOf(partsOfWildcard[i])); - } - isFirst = false; + String contentWithinWildcard; + if (isFirst && partsOfWildcard.length > 1) { + if (partsOfWildcard[i].length() < 1) { + // Leading wildcard, without trailing present + contentWithinWildcard = currentContent + .substring(0, currentContent.indexOf(partsOfWildcard[i + 1])); } else { - if (subSeq != null) { - // Trailing wildcard, with leading wildcard - contentWithinWildcard = currentContent.substring(currentContent.indexOf(partsOfWildcard[i]) + partsOfWildcard[i].length()); - } - else { - // Trailing wildcard, no leading wildcard - contentWithinWildcard = currentContent.substring(partsOfWildcard[i].length()); - } - + // Leading wildcard, with trailing present + contentWithinWildcard = currentContent.substring(0, currentContent.indexOf(partsOfWildcard[i])); } - - LOGGER.debug("The content within wildcard: <{}> ", contentWithinWildcard); - - if (subSeq == null) { - // First wildcard to be processed -> subsequence does not yet exist - subSeq = wildcardMatcher.replaceFirst(Matcher.quoteReplacement(contentWithinWildcard)); + isFirst = false; + } + else { + if (subSeq != null) { + // Trailing wildcard, with leading wildcard + contentWithinWildcard = currentContent + .substring(currentContent.indexOf(partsOfWildcard[i]) + partsOfWildcard[i].length()); } else { - // Subsequence exists, generate a new matcher for the subsequence and continue building it - wildcardMatcher = Pattern.compile("\\*").matcher(subSeq); - subSeq = wildcardMatcher.replaceFirst(Matcher.quoteReplacement(contentWithinWildcard)); + // Trailing wildcard, no leading wildcard + contentWithinWildcard = currentContent.substring(partsOfWildcard[i].length()); } + + } + + LOGGER.debug("The content within wildcard: <{}> ", contentWithinWildcard); + + if (subSeq == null) { + // First wildcard to be processed -> subsequence does not yet exist + subSeq = wildcardMatcher.replaceFirst(Matcher.quoteReplacement(contentWithinWildcard)); + } + else { + // Subsequence exists, generate a new matcher for the subsequence and continue building it + wildcardMatcher = 
Pattern.compile("\\*").matcher(subSeq); + subSeq = wildcardMatcher.replaceFirst(Matcher.quoteReplacement(contentWithinWildcard)); + } LOGGER.debug("Subsequent wildcard: <{}>", subSeq); } } @@ -158,14 +165,15 @@ public String call(String currentContent, String wildcard, String replaceWith) t } /** - * Converts a wildcard statement into a regex statement: - * all regex-sensitive characters are escaped, and the wildcard (*) gets - * converted into a regex any character wildcard (.*) + * Converts a wildcard statement into a regex statement: all regex-sensitive characters are escaped, and the + * wildcard (*) gets converted into a regex any character wildcard (.*) + * * @param wc wildcard statement string * @return regex statement string */ private String wcfieldToRegex(String wc) { - return wc.replaceAll("\\\\", "\\\\") + return wc + .replaceAll("\\\\", "\\\\") .replaceAll("\\^", "\\\\^") .replaceAll("\\.", "\\\\.") .replaceAll("\\|", "\\\\|") diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex/CheckedSedString.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex/CheckedSedString.java index c50defc..9cf961f 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex/CheckedSedString.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex/CheckedSedString.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -46,16 +46,17 @@ package com.teragrep.pth10.ast.commands.transformstatement.rex; /** - * Checked to be valid sed mode string. - * Checks validity on initialization. + * Checked to be valid sed mode string. Checks validity on initialization. */ public class CheckedSedString { + private String[] components; private boolean globalMode; private int replaceOccurrencesAmount; /** * Initialize a new instance of a checked sed string + * * @param uncheckedString string of a valid sed string format 's/.../.../g' */ public CheckedSedString(String uncheckedString) { @@ -66,6 +67,7 @@ public CheckedSedString(String uncheckedString) { /** * Get the regex to match in string format + * * @return regex string */ public String toRegexString() { @@ -74,6 +76,7 @@ public String toRegexString() { /** * Get the replacement string + * * @return replacement */ public String toReplacementString() { @@ -82,6 +85,7 @@ public String toReplacementString() { /** * How many occurrences of regex to replace + * * @return int, -1 means all */ public int replaceOccurrencesAmount() { @@ -90,6 +94,7 @@ public int replaceOccurrencesAmount() { /** * get if sed global mode is to be used + * * @return bool for global mode */ public boolean globalMode() { @@ -98,6 +103,7 @@ public boolean globalMode() { /** * Checks the string validity. Throws exceptions if errors are encountered. 
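The accepted format mirrors sed's substitute command, s/regex/replacement/flags with any single-character delimiter; a usage sketch, assuming the accessors return the delimited parts as their names suggest (the input string is illustrative):

    import com.teragrep.pth10.ast.commands.transformstatement.rex.CheckedSedString;

    public class SedStringSketch {

        public static void main(String[] args) {
            CheckedSedString sed = new CheckedSedString("s/error/warning/2g");
            System.out.println(sed.toRegexString());            // expected: "error"
            System.out.println(sed.toReplacementString());      // expected: "warning"
            System.out.println(sed.globalMode());               // true, 'g' flag present
            System.out.println(sed.replaceOccurrencesAmount()); // 2
        }
    }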
+ * * @param sedStr string to check */ private void checkSedString(final String sedStr) { @@ -121,22 +127,25 @@ private void checkSedString(final String sedStr) { // should have four parts: sed mode, original string, replacement string and other flags. if (this.components.length != 4) { - throw new IllegalStateException("Invalid sed mode string was given: '" + sedStr + "', but expected " + - "s/original/replacement/[g|Ng|N], where n>0 and '/' is any delimiter of choice."); + throw new IllegalStateException( + "Invalid sed mode string was given: '" + sedStr + "', but expected " + + "s/original/replacement/[g|Ng|N], where n>0 and '/' is any delimiter of choice." + ); } - if (this.components[3].charAt(this.components[3].length()-1) == 'g' && this.components[3].length() == 1) { + if (this.components[3].charAt(this.components[3].length() - 1) == 'g' && this.components[3].length() == 1) { // global mode 'g' - globalMode=true; + globalMode = true; } - else if (this.components[3].charAt(this.components[3].length()-1) != 'g') { + else if (this.components[3].charAt(this.components[3].length() - 1) != 'g') { // replace occurrences mode 'N' replaceOccurrencesAmount = Integer.parseInt(this.components[3]); } else { // 'Ng' mode - globalMode=true; - replaceOccurrencesAmount = Integer.parseInt(this.components[3].substring(0, this.components[3].length()-1)); + globalMode = true; + replaceOccurrencesAmount = Integer + .parseInt(this.components[3].substring(0, this.components[3].length() - 1)); } } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex/RexExtractModeUDF.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex/RexExtractModeUDF.java index d394814..0104716 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex/RexExtractModeUDF.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex/RexExtractModeUDF.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -52,6 +52,7 @@ import java.util.Map; public class RexExtractModeUDF implements UDF2> { + @Override public Map call(String inputStr, String regexStr) throws Exception { // Create JavaPCRE instance diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex/RexSedModeUDF.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex/RexSedModeUDF.java index d6faee7..843b5ef 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex/RexSedModeUDF.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex/RexSedModeUDF.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -52,6 +52,7 @@ import java.util.List; public class RexSedModeUDF implements UDF2 { + @Override public String call(String inputStr, String sedStr) throws Exception { JavaPcre pcre = new JavaPcre(); @@ -67,11 +68,12 @@ public String call(String inputStr, String sedStr) throws Exception { if (checkedSedString.globalMode()) { // global mode offsets = getAllOccurrences(pcre, inputStr, checkedSedString.replaceOccurrencesAmount()); - } else { + } + else { // replace nth occurrence mode offsets = getUpToNthOccurrence(pcre, inputStr, checkedSedString.replaceOccurrencesAmount()); // only need the last nth occurrence - offsets = offsets.subList(offsets.size()-1, offsets.size()); + offsets = offsets.subList(offsets.size() - 1, offsets.size()); } StringBuilder resultStrBuilder = new StringBuilder(); @@ -91,7 +93,7 @@ else if (j > beginInd && j < endInd) { // part-to-be-replaced, skip chars continue; } - else if (j == endInd && i == offsets.size()-1) { + else if (j == endInd && i == offsets.size() - 1) { // final set of offsets, add the remaining bits of the input resultStrBuilder.append(inputStr.charAt(j)); } @@ -123,7 +125,9 @@ private List getUpToNthOccurrence(JavaPcre jPcre, String input, int n) { offset = jPcre.get_ovector1(); //System.out.printf("Match found (%s, %s): '%s'\n", start, offset, input.substring(start, offset)); - offsets.add(new int[]{ start, offset }); + offsets.add(new int[] { + start, offset + }); } return offsets; @@ -141,7 +145,9 @@ private List getAllOccurrences(JavaPcre jPcre, String input, int from) { offset = jPcre.get_ovector1(); if (count >= from) { - offsets.add(new int[]{ start, offset }); + offsets.add(new int[] { + start, offset + }); } //System.out.printf("Match found (%s, %s): '%s'\n", start, offset, input.substring(start, offset)); jPcre.singlematch_java(input, offset); diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex4j/NamedGroupsRex.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex4j/NamedGroupsRex.java index fa05b4c..d35c1a0 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex4j/NamedGroupsRex.java 
+++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex4j/NamedGroupsRex.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.rex4j; import java.util.*; @@ -51,49 +50,49 @@ import java.util.regex.Pattern; /** - * Class used to get the capture groups for the rex4j command, - * which means which regex will match which new generated field.
- + * Class used to get the capture groups for the rex4j command, i.e. which named capture group maps to which newly + * generated field.
* Syntax example:
 * {@literal .*latitude\":\s(?<latiTUDE>-?\d+.\d+)}
- * Would return anything matching ' latitude": 0.0 ' as a new field - * latiTUDE with the 0.0 being the contents of that new generated field. - * + * Would return anything matching ' latitude": 0.0 ' as a new field latiTUDE with the 0.0 being the contents of that new + * generated field. */ public class NamedGroupsRex { /** * Gets multiple new groups + * * @param regex Regex and group * @return map with group and group index */ - public static Map getNamedGroups(String regex) { + public static Map getNamedGroups(String regex) { List namedGroups = new ArrayList<>(); - Map rv = new LinkedHashMap<>(); - HashMap offsets = new HashMap<>(); + Map rv = new LinkedHashMap<>(); + HashMap offsets = new HashMap<>(); Matcher m = Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>").matcher(regex); int ind = 1; while (m.find()) { namedGroups.add(m.group(1)); -// LOGGER.info(m.group()+" groupCount"+(ind)); - rv.put(m.group(1),ind++); + // LOGGER.info(m.group()+" groupCount"+(ind)); + rv.put(m.group(1), ind++); } return rv; } /** * Gets a single group + * * @param regex Regex and group * @return group */ public static String getNamedGroup(String regex) { - Matcher m = Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>").matcher(regex); - - if (m.find()) { - return m.group(1); - } - - return null; + Matcher m = Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>").matcher(regex); + + if (m.find()) { + return m.group(1); + } + + return null; } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex4j/Rex4jTransformation.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex4j/Rex4jTransformation.java index c76fd0c..0b3defa 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex4j/Rex4jTransformation.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/rex4j/Rex4jTransformation.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.rex4j; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -60,30 +59,26 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.List; - /** * Class that contains the necessary implemented visitor functions for the rex4j command.
- * - * Rex4j provides a way to extract data from fields and generate new fields based on the extracted - * data.
- * - * Rex4j also has a replace mode (mode=sed) that can use sed-based syntax to replace - * values in the given field. If no field is specified, field is set to "_raw" by default. + * Rex4j provides a way to extract data from fields and generate new fields based on the extracted data.
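The generated fields come from the named capture groups of the supplied regex; NamedGroupsRex, shown above, maps each group name to its group index. A brief sketch of that helper (the regex is illustrative):

    import com.teragrep.pth10.ast.commands.transformstatement.rex4j.NamedGroupsRex;

    public class NamedGroupsSketch {

        public static void main(String[] args) {
            // group names are restricted to [a-zA-Z][a-zA-Z0-9]* by the helper's pattern
            System.out.println(NamedGroupsRex.getNamedGroups("(?<user>\\w+) logged in from (?<host>\\w+)")); // {user=1, host=2}
            System.out.println(NamedGroupsRex.getNamedGroup("(?<user>\\w+)"));                               // user
        }
    }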
+ * Rex4j also has a replace mode (mode=sed) that can use sed-based syntax to replace values in the given field. If no + * field is specified, field is set to "_raw" by default. */ public class Rex4jTransformation extends DPLParserBaseVisitor { + private static final Logger LOGGER = LoggerFactory.getLogger(Rex4jTransformation.class); public Rex4jStep rex4jStep = null; private final DPLParserCatalystContext catCtx; - public Rex4jTransformation(DPLParserCatalystContext catCtx) - { + public Rex4jTransformation(DPLParserCatalystContext catCtx) { this.catCtx = catCtx; } /** * Main visitor function, from where the rest of the parse tree for this command will be walked + * * @param ctx Rex4jTransformationContext * @return StepNode containing Step for rex4j command */ @@ -99,7 +94,7 @@ public Node rexTransformationEmitCatalyst(DPLParser.Rex4jTransformationContext c String field = "_raw"; // The field that you want to extract information from. // Optional fieldname, default is _raw - if(ctx.t_rex4j_fieldParameter()!= null) { + if (ctx.t_rex4j_fieldParameter() != null) { field = visit(ctx.t_rex4j_fieldParameter()).toString(); } @@ -127,28 +122,34 @@ public Node rexTransformationEmitCatalyst(DPLParser.Rex4jTransformationContext c return new StepNode(rex4jStep); } - @Override public Node visitT_rex4j_fieldParameter(DPLParser.T_rex4j_fieldParameterContext ctx) { + @Override + public Node visitT_rex4j_fieldParameter(DPLParser.T_rex4j_fieldParameterContext ctx) { String s = ctx.getChild(1).getText(); s = new UnquotedText(new TextString(s)).read(); - StringNode rv = new StringNode(new Token(Type.STRING, s)); + StringNode rv = new StringNode(new Token(Type.STRING, s)); return rv; } - @Override public Node visitT_rex4j_maxMatchParameter(DPLParser.T_rex4j_maxMatchParameterContext ctx) { + @Override + public Node visitT_rex4j_maxMatchParameter(DPLParser.T_rex4j_maxMatchParameterContext ctx) { String s = ctx.getChild(1).getText(); s = new UnquotedText(new TextString(s)).read(); - StringNode rv = new StringNode(new Token(Type.STRING,s)); - LOGGER.info("visitT_rex4j_maxMatchParameter: return=<{}>" , rv); + StringNode rv = new StringNode(new Token(Type.STRING, s)); + LOGGER.info("visitT_rex4j_maxMatchParameter: return=<{}>", rv); return rv; } - @Override public Node visitT_rex4j_modeSedParameter(DPLParser.T_rex4j_modeSedParameterContext ctx) { + + @Override + public Node visitT_rex4j_modeSedParameter(DPLParser.T_rex4j_modeSedParameterContext ctx) { TerminalNode sedMode = (TerminalNode) ctx.getChild(1); //DPLLexer.COMMAND_REX4J_MODE_REGEXP_REPLACE - return new StringNode(new Token(Type.STRING, sedMode.getSymbol().toString())); + return new StringNode(new Token(Type.STRING, sedMode.getSymbol().toString())); } - @Override public Node visitT_rex4j_offsetFieldParameter(DPLParser.T_rex4j_offsetFieldParameterContext ctx) { + + @Override + public Node visitT_rex4j_offsetFieldParameter(DPLParser.T_rex4j_offsetFieldParameterContext ctx) { throw new RuntimeException("rex4j_offsetFieldParameter not supported yet"); -// return visitChildren(ctx); + // return visitChildren(ctx); } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/sendemail/DatasetToTextBuilder.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/sendemail/DatasetToTextBuilder.java index 909f893..86fa895 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/sendemail/DatasetToTextBuilder.java +++ 
b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/sendemail/DatasetToTextBuilder.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.sendemail; import org.apache.spark.sql.Dataset; @@ -57,191 +56,192 @@ /** * Converts Dataset to text, such as an HTML table, CSV or raw. + * * @author eemhu - * */ public class DatasetToTextBuilder implements Serializable { - - private static final long serialVersionUID = 1L; - private String format = "table"; - private String lineBreak = "\n"; - - /** - * empty constructor will use default 'table' format and '\n' line break - */ - public DatasetToTextBuilder() { } - - /** - * Constructor that provides format and linebreak character - * @param format html, csv or raw - * @param lineBreak like \n - */ - public DatasetToTextBuilder(String format, String lineBreak) - { - this.format = format; - this.lineBreak = lineBreak; - } - - /** - * Collects dataset to driver and builds the text content using the - * {@link #build(List)} method - * @param dset Dataset to build the content from - * @return text content - */ - public String build(Dataset dset) { - List listOfRows = dset.collectAsList(); // to driver, don't use for high count datasets - return this.build(listOfRows); - } - - /** - * Build string with given format
- * "table" format can still use hard-coded \n line break since it is just html code - * @param listOfRows list of dataset rows - * @return text content - */ - public String build(List listOfRows) { - if (listOfRows.size() == 0 && format.equals("csv")) { - return "no_results" + this.lineBreak + "Search results had no rows."; - } - else if (listOfRows.size() == 0 && format.equals("table")) { - return "\n\nSearch results had no rows.\n\n"; - } - else if (listOfRows.size() == 0 && format.equals("raw")) { - return "Search results had no rows."; - } - - StructType schema = listOfRows.get(0).schema(); - int numOfCols = schema.length(); - - if (format.equals("table")) { - // includes styling to make every other row grey-ish - String html = "\n\n"; - - // column headers - html = html.concat("\n"); - - Iterator it = schema.iterator(); - while (it.hasNext()) { - StructField col = it.next(); - String colName = col.name(); - - html = html.concat("\n"); - } - - html = html.concat("\n"); - - // actual rows - for (Row r : listOfRows) { - html = html.concat("\n"); - - for (int i = 0; i < numOfCols; i++) { - html = html.concat("\n"); - } - - html = html.concat("\n"); - } - - html = html.concat("
"); - html = html.concat(colName); - html = html.concat("
"); - - // check for null - Object cell = r.getAs(i); - if (cell != null) { - html = html.concat(cell.toString()); - } - else { - html = html.concat("null"); - } - - html = html.concat("
"); - - return html; - } - else if (format.equals("csv")) { - String csv = ""; - String cols = ""; - - // Go through columns - Iterator it = schema.iterator(); - boolean first = true; - while (it.hasNext()) { - StructField col = it.next(); - String colName = col.name(); - - if (first) { - cols = cols.concat(colName); - first = false; - } - else { - cols = cols.concat(","); - cols = cols.concat(colName); - } - } - - // add column headers - csv = csv.concat(cols); - csv = csv.concat(this.lineBreak); - - // Go through rows - for (Row r : listOfRows) { - String rowString = ""; - boolean firstInRow = true; - - // Go through cells in each row - // Enclose cells in double quotes - for (int i = 0; i < r.length(); i++) { - String cell = r.getAs(i).toString(); - cell = cell.replaceAll("\"", "\"\""); // in-cell double quotes need to be duplicated - - if (firstInRow) { - cell = "\"" + cell + "\""; - firstInRow = false; - } - else { - cell = ",\"" + cell + "\""; - } - rowString = rowString.concat(cell); - } - csv = csv.concat(rowString); - csv = csv.concat(this.lineBreak); - } - - return csv; - } - else if (format.equals("raw")) { - String raw = ""; - String cols = ""; - - Iterator it = schema.iterator(); - boolean first = true; - while (it.hasNext()) { - StructField col = it.next(); - String colName = col.name(); - - if (first) { - cols = cols.concat(colName); - first = false; - } - else { - cols = cols.concat(this.lineBreak); - cols = cols.concat(colName); - } - } - - // column headers - raw = raw.concat(cols); // substring to remove [ ] from string - raw = raw.concat(this.lineBreak); - // rows - for (Row r : listOfRows) { - raw = raw.concat(r.mkString(this.lineBreak)); - raw = raw.concat(this.lineBreak); - } - - - return raw; - } - - throw new IllegalArgumentException("Invalid inline data format '" + format + "' !"); - } + + private static final long serialVersionUID = 1L; + private String format = "table"; + private String lineBreak = "\n"; + + /** + * empty constructor will use default 'table' format and '\n' line break + */ + public DatasetToTextBuilder() { + } + + /** + * Constructor that provides format and linebreak character + * + * @param format html, csv or raw + * @param lineBreak like \n + */ + public DatasetToTextBuilder(String format, String lineBreak) { + this.format = format; + this.lineBreak = lineBreak; + } + + /** + * Collects dataset to driver and builds the text content using the {@link #build(List)} method + * + * @param dset Dataset to build the content from + * @return text content + */ + public String build(Dataset dset) { + List listOfRows = dset.collectAsList(); // to driver, don't use for high count datasets + return this.build(listOfRows); + } + + /** + * Build string with given format
+ * "table" format can still use hard-coded \n line break since it is just html code + * + * @param listOfRows list of dataset rows + * @return text content + */ + public String build(List listOfRows) { + if (listOfRows.size() == 0 && format.equals("csv")) { + return "no_results" + this.lineBreak + "Search results had no rows."; + } + else if (listOfRows.size() == 0 && format.equals("table")) { + return "\n\nSearch results had no rows.\n\n"; + } + else if (listOfRows.size() == 0 && format.equals("raw")) { + return "Search results had no rows."; + } + + StructType schema = listOfRows.get(0).schema(); + int numOfCols = schema.length(); + + if (format.equals("table")) { + // includes styling to make every other row grey-ish + String html = "\n\n"; + + // column headers + html = html.concat("\n"); + + Iterator it = schema.iterator(); + while (it.hasNext()) { + StructField col = it.next(); + String colName = col.name(); + + html = html.concat("\n"); + } + + html = html.concat("\n"); + + // actual rows + for (Row r : listOfRows) { + html = html.concat("\n"); + + for (int i = 0; i < numOfCols; i++) { + html = html.concat("\n"); + } + + html = html.concat("\n"); + } + + html = html.concat("
"); + html = html.concat(colName); + html = html.concat("
"); + + // check for null + Object cell = r.getAs(i); + if (cell != null) { + html = html.concat(cell.toString()); + } + else { + html = html.concat("null"); + } + + html = html.concat("
"); + + return html; + } + else if (format.equals("csv")) { + String csv = ""; + String cols = ""; + + // Go through columns + Iterator it = schema.iterator(); + boolean first = true; + while (it.hasNext()) { + StructField col = it.next(); + String colName = col.name(); + + if (first) { + cols = cols.concat(colName); + first = false; + } + else { + cols = cols.concat(","); + cols = cols.concat(colName); + } + } + + // add column headers + csv = csv.concat(cols); + csv = csv.concat(this.lineBreak); + + // Go through rows + for (Row r : listOfRows) { + String rowString = ""; + boolean firstInRow = true; + + // Go through cells in each row + // Enclose cells in double quotes + for (int i = 0; i < r.length(); i++) { + String cell = r.getAs(i).toString(); + cell = cell.replaceAll("\"", "\"\""); // in-cell double quotes need to be duplicated + + if (firstInRow) { + cell = "\"" + cell + "\""; + firstInRow = false; + } + else { + cell = ",\"" + cell + "\""; + } + rowString = rowString.concat(cell); + } + csv = csv.concat(rowString); + csv = csv.concat(this.lineBreak); + } + + return csv; + } + else if (format.equals("raw")) { + String raw = ""; + String cols = ""; + + Iterator it = schema.iterator(); + boolean first = true; + while (it.hasNext()) { + StructField col = it.next(); + String colName = col.name(); + + if (first) { + cols = cols.concat(colName); + first = false; + } + else { + cols = cols.concat(this.lineBreak); + cols = cols.concat(colName); + } + } + + // column headers + raw = raw.concat(cols); // substring to remove [ ] from string + raw = raw.concat(this.lineBreak); + // rows + for (Row r : listOfRows) { + raw = raw.concat(r.mkString(this.lineBreak)); + raw = raw.concat(this.lineBreak); + } + + return raw; + } + + throw new IllegalArgumentException("Invalid inline data format '" + format + "' !"); + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/sendemail/SendemailResultsProcessor.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/sendemail/SendemailResultsProcessor.java index ba50d1f..c9b34e5 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/sendemail/SendemailResultsProcessor.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/sendemail/SendemailResultsProcessor.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.sendemail; import be.quodlibet.boxable.BaseTable; @@ -67,522 +66,566 @@ import java.util.*; /** - * A class that processes a list of rows into an email. - * Used for the sendemail command. + * A class that processes a list of rows into an email. Used for the sendemail command. 
*/ public class SendemailResultsProcessor implements Serializable { - private static final Logger LOGGER = LoggerFactory.getLogger(SendemailResultsProcessor.class); - - private static final long serialVersionUID = 1L; - private static List listOfRows = new ArrayList<>(); - private static int maxCount = 50000; - private static int count = 0; - - // from constructor - private boolean use_tls = false; - private String server = "localhost"; - private int port = 25; - private boolean use_ssl = false; - //DPLParserCatalystContext catCtx = null; - private String fromEmail = null; - private String toEmails = null; - private String ccEmails = null; - private String bccEmails = null; - private String subject = null; - private String customMessageContent = null; - - private String format = "csv"; - private boolean sendResults = false; - private boolean inline = false; - private boolean sendCsv = false; - private boolean sendPdf = false; - private String customFooterContent = null; - private String paperSize = null; - private String paperOrientation = null; - private String content_type = null; - - private static PDPage page1 = new PDPage(); - private static PDDocument pdfDoc = new PDDocument(); - private static MimeBodyPart dataBodyPart = new MimeBodyPart(); - private static MimeBodyPart attachmentBodyPart = new MimeBodyPart(); - - private static String username = ""; - private static String password = ""; - - private String urlToParagraph = null; - - private boolean smtpDebug = false; // enable debugging for email sending - - // Was the email processor called already? (make sure that email is only sent once if sendresults=false) - private boolean isCalledBefore = false; - - /** - * Construct the SendemailResultsProcessor with given parameters. - * - * @param use_tls Use TLS or not - * @param server server host name (e.g. localhost) - * @param port server port number (e.g. 25) - * @param use_ssl Use SSL or not - * @param username SMTP username - * @param password SMTP password - * @param fromEmail Send from email - * @param toEmails Send to emails (separate with comma) - * @param ccEmails cc emails (separate with comma) - * @param bccEmails bcc emails (separate with comma) - * @param subject custom subject if needed - * @param customMessageContent custom message if needed - * @param format format of inline results (csv, raw, table) - * @param sendResults whether or not include results in email - * @param inline whether or not include inline results in email - * @param sendCsv send csv attachment? - * @param sendPdf send pdf attachment? - * @param customFooterContent custom footer if needed - * @param paperSize custom paperSize if needed (e.g. "a4") - * @param paperOrientation custom paper orientation (e.g. 
"landscape" or "portrait") - * @param content_type "plain" text or "html" - * @param maxInputs maximum batch size to send at a time - * @param smtpDebug enable additional SMTP debug logging - * @param urlToParagraph URL address to paragraph containing the results - */ - public SendemailResultsProcessor(boolean use_tls, String server, int port, boolean use_ssl, String username, String password, String fromEmail, String toEmails, String ccEmails, String bccEmails, String subject, String customMessageContent, - String format, boolean sendResults, boolean inline, boolean sendCsv, boolean sendPdf, String customFooterContent, String paperSize, String paperOrientation, String content_type, int maxInputs, String urlToParagraph, boolean smtpDebug) { - super(); - - this.use_tls = use_tls; - this.server = server; - this.port = port; - this.use_ssl = use_ssl; - - SendemailResultsProcessor.username = username; - SendemailResultsProcessor.password = password; - - this.fromEmail = fromEmail; - this.toEmails = toEmails; - this.ccEmails = ccEmails; - this.bccEmails = bccEmails; - this.subject = subject; - this.customMessageContent = customMessageContent; - - this.format = format; - this.sendResults = sendResults; - this.inline = inline; - this.sendCsv = sendCsv; - this.sendPdf = sendPdf; - this.customFooterContent = customFooterContent; - this.paperSize = paperSize; - this.paperOrientation = paperOrientation; - this.content_type = content_type; - - SendemailResultsProcessor.maxCount = maxInputs; // set to other than 50k - - this.urlToParagraph = urlToParagraph; - this.smtpDebug = smtpDebug; - } - - /** - * Used for testing translation; not to be used outside of testing and/or debugging. - * Returns all parameters in a Map - * @return map of parameters, where key= variable name, value= the value of that variable - */ - @VisibleForTesting - public Map getParameters() { - Map params = new HashMap<>(); - - params.put("use_tls", String.valueOf(use_tls)); - params.put("server", server); - params.put("port", String.valueOf(port)); - params.put("use_ssl", String.valueOf(use_ssl)); - params.put("username", username); - params.put("password", password != null ? "***" : "NULL"); - params.put("fromEmail", fromEmail); - params.put("toEmails", toEmails); - params.put("bccEmails", bccEmails); - params.put("ccEmails", ccEmails); - params.put("subject", subject); - params.put("customMessageContent", customMessageContent); - params.put("format", format); - params.put("sendResults", String.valueOf(sendResults)); - params.put("inline", String.valueOf(inline)); - params.put("sendCsv", String.valueOf(sendCsv)); - params.put("sendPdf", String.valueOf(sendPdf)); - params.put("customFooterContent", customFooterContent); - params.put("paperSize", paperSize); - params.put("paperOrientation", paperOrientation); - params.put("content_type", content_type); - params.put("maxinputs", String.valueOf(maxCount)); - params.put("urlToParagraph", urlToParagraph); - params.put("smtpDebug", String.valueOf(smtpDebug)); - - return params; - } - - /** - * Has the call() function of this object been used even once? 
- * @return true or false - */ - public boolean getIsCalledBefore() { - return this.isCalledBefore; - } - - /** - * Builds email on maxInputs batches using a list of rows given - * @param rows List of rows - * @throws Exception Any exception that occurred during building and sending the email - */ - public void call(List rows) throws Exception { - this.isCalledBefore = true; - // Size of list of rows (collected dataframe) - int size = rows.size(); - // rows needed for current maxInputs email batch - int remainingToBeAdded = maxCount - count; - - // none remaining, build email - if (remainingToBeAdded == 0) { - buildEmail(listOfRows); - listOfRows.clear(); - count = 0; - } - // Needs more than currently in list of rows - else if (remainingToBeAdded > size) { - listOfRows.addAll(rows); - count += size; - } - // Current email batch needs less than in list of rows, - // call this function again with surplus rows - else { // remaining != 0, remaining < size - listOfRows.addAll(rows.subList(0, remainingToBeAdded)); - buildEmail(listOfRows); - listOfRows.clear(); - count = 0; - - List toAdd = rows.subList(remainingToBeAdded, rows.size()); - this.call(toAdd); - } - } - - /** - * Send email without rows (sendresults=false) - * @throws Exception Any exception that occurred during building and sending the email - */ - public void call() throws Exception { - this.isCalledBefore = true; - buildEmail(null); - } - - /** - * flushes the remaining rows that were not processed as they did not reach the target count - * @throws Exception Any error that occurred during the flush() - */ - public void flush() throws Exception { - LOGGER.info("Flushing email processor!"); - if (listOfRows != null && listOfRows.size() > 0) { - buildEmail(listOfRows); - listOfRows.clear(); - count = 0; - } - } - - /** - * Builds the email from current content of listOfRows - * @throws MessagingException - */ - private void buildEmail(List listOfRows) throws MessagingException { - LOGGER.info("Building email!"); - - // Properties instance used for setting up email parameters - Properties emailProp = new Properties(); - emailProp.put("mail.smtp.auth", use_ssl || use_tls); // use auth if either ssl or tls is used - emailProp.put("mail.smtp.starttls.enable", use_tls); - emailProp.put("mail.smtp.host", server); - emailProp.put("mail.smtp.port", port); - emailProp.put("mail.smtp.ssl.enable", use_ssl); - - LOGGER.info("Sendemail properties: <{}>", emailProp.entrySet()); - - // user and pass from zeppelin config - final Session session = Session.getInstance(emailProp, new Authenticator() { - @Override - protected PasswordAuthentication getPasswordAuthentication() { - return new PasswordAuthentication(username, password); - } - }); - - // Set debugging, if given in zeppelin config - session.setDebug(this.smtpDebug); - - // Start building the email message - try { - final Message message = new MimeMessage(session); - - // from, to, cc and bcc emails - if (fromEmail != null) { - message.setFrom(new InternetAddress(fromEmail)); - } - - if (toEmails != null) { - message.setRecipients(Message.RecipientType.TO, InternetAddress.parse(toEmails)); - } - else { - throw new IllegalArgumentException("Sendemail command did not contain a required parameter: 'to=' !"); - } - - - if (ccEmails != null) { - message.setRecipients(Message.RecipientType.CC, InternetAddress.parse(ccEmails)); - } - - if (bccEmails != null) { - message.setRecipients(Message.RecipientType.BCC, InternetAddress.parse(bccEmails)); - } - - // set subject if given in command, otherwise 
defaults to "Teragrep Results" - message.setSubject(subject != null ? subject : "Teragrep Results"); - - // line break based on content_type - String lineBreak = "\n"; - if (this.content_type.equals("html")) { - lineBreak = "
"; - } - - // Message depends on given parameters - // sendResults=false -> Search complete - // sendResults=true, inline=true, and no attachments -> Search results. - // sendResults=true with attachments -> Search results attached. - // - // Apply custom message if it was given in the command - String messageContent = ""; - if (customMessageContent == null) { - if (!sendResults) { - messageContent = "Search complete."; - } - else if (sendResults && inline && (!sendPdf || !sendCsv)) { - messageContent = "Search results."; - } - else if (sendResults && (sendPdf || sendCsv)) { - messageContent = "Search results attached."; - } - } - else { - messageContent = customMessageContent; - } - - // Add url to paragraph - if (this.urlToParagraph != null) { - if (this.content_type.equals("html")) { - messageContent += String.format("
View results in Teragrep", this.urlToParagraph); - } - else { - // plain - messageContent += String.format("\n%s", this.urlToParagraph); - } - - } - - // Footer (again, custom footer will be applied if it was given in the command) - MimeBodyPart footerBodyPart = new MimeBodyPart(); - String footerContent = "This email was generated via the sendemail command. Not the correct recipient? Contact your Teragrep administrator." + lineBreak + "Teragrep - Know Everything"; - if (customFooterContent != null) { - footerContent = customFooterContent; - } - footerBodyPart.setContent(footerContent, content_type.equals("html") ? "text/html; charset=utf-8" : "text/plain; charset=utf-8"); - - // Full email will be assembled to MimeMultipart - final Multipart multipart = new MimeMultipart(); - - // Set messageContent to MimeBodyPart - MimeBodyPart messageBodyPart = new MimeBodyPart(); - messageBodyPart.setContent(messageContent, content_type.equals("html") ? "text/html; charset=utf-8" : "text/plain; charset=utf-8"); - - // Add message bodypart to MimeMultipart - multipart.addBodyPart(messageBodyPart); - - // pdf paper settings - // Object hierarchy for pdf document - // pdfDoc (main document) <- page1 (page) <- table (outside table) <- t (internal data) - pdfDoc = new PDDocument(); - page1 = null; - - // set paper size - float paperWidth = 0f, paperHeight = 0f; - - switch (paperSize) { - case "legal": - paperWidth = PDRectangle.LEGAL.getWidth(); - paperHeight = PDRectangle.LEGAL.getHeight(); - break; - case "a2": - paperWidth = PDRectangle.A2.getWidth(); - paperHeight = PDRectangle.A2.getHeight(); - break; - case "a3": - case "ledger": - // ledger is equivalent to A3 - paperWidth = PDRectangle.A3.getWidth(); - paperHeight = PDRectangle.A3.getHeight(); - break; - case "a4": - paperWidth = PDRectangle.A4.getWidth(); - paperHeight = PDRectangle.A4.getHeight(); - break; - case "a5": - paperWidth = PDRectangle.A5.getWidth(); - paperHeight = PDRectangle.A5.getHeight(); - break; - case "letter": - default: - // letter is default - paperWidth = PDRectangle.LETTER.getWidth(); - paperHeight = PDRectangle.LETTER.getHeight(); - break; - } - - // Create page with set paper orientation and size - if (paperOrientation.equals("landscape")) { - page1 = new PDPage(new PDRectangle(paperHeight, paperWidth)); - } - else { - // default (portrait) - page1 = new PDPage(new PDRectangle(paperWidth, paperHeight)); - } - - String dataContent = null; - - // Get search results if sendResults = true - dataBodyPart = new MimeBodyPart(); - - // DatasetToTextBuilder init - DatasetToTextBuilder txtBuilder = new DatasetToTextBuilder(format, lineBreak); - if (sendResults && inline) { - dataContent = txtBuilder.build(listOfRows); - // set dataContent to dataBodyPart - dataBodyPart.setContent(dataContent, content_type.equals("html") ? 
"text/html; charset=utf-8" : "text/plain; charset=utf-8"); - } - - // sendCsv and sendPdf - if (sendResults && sendCsv) { - if (!inline || format != "csv" || !lineBreak.equals("\n")) { - // dataContent is something other than csv - // re-generate csv version - txtBuilder = new DatasetToTextBuilder("csv", "\n"); - dataContent = txtBuilder.build(listOfRows); - } - - byte[] fileBase64ByteArray; - try { - fileBase64ByteArray = java.util.Base64.getEncoder().encode(dataContent.getBytes("UTF-8")); - // headers for attachment - InternetHeaders fileHeaders = new InternetHeaders(); - fileHeaders.setHeader("Content-Type", "text/csv; name=\"results.csv\""); - fileHeaders.setHeader("Content-Transfer-Encoding", "base64"); - fileHeaders.setHeader("Content-Disposition", "attachment; filename=\"results.csv\""); - // body part for attachment - attachmentBodyPart = new MimeBodyPart(fileHeaders, fileBase64ByteArray); - attachmentBodyPart.setFileName("results.csv"); - - multipart.addBodyPart(attachmentBodyPart); // attachment - - } - catch (UnsupportedEncodingException e) { - e.printStackTrace(); - } - } - else if (sendResults && sendPdf) { - if (!inline || format != "csv" || !lineBreak.equals("\n")) { - // dataContent is something other than csv - // re-generate csv version - // csv is used to generate table to pdf - txtBuilder = new DatasetToTextBuilder("csv", "\n"); - dataContent = txtBuilder.build(listOfRows); - } - - // Generate content - try { - // add page to main document and initialize contentStream - pdfDoc.addPage(page1); - PDPageContentStream contentStream = new PDPageContentStream(pdfDoc, page1); - - // Setup contentStream for text - contentStream.beginText(); - contentStream.setFont(PDType1Font.TIMES_ROMAN, 16); - contentStream.setLeading(14.5f); - - // table settings - float margin = 50; - float yStartNewPage = page1.getMediaBox().getHeight() - (2 * margin); - float tableWidth = page1.getMediaBox().getWidth() - (2 * margin); - boolean drawContent = true; - float yStart = yStartNewPage; - float bottomMargin = 70; - float yPosition = page1.getMediaBox().getHeight() - 100; - - // Generate BaseTable and DataTable - BaseTable table = new BaseTable(yPosition, yStartNewPage, bottomMargin, tableWidth, margin, pdfDoc, page1, true, drawContent); - DataTable t = new DataTable(table, page1); - t.addCsvToTable(dataContent, DataTable.HASHEADER, ','); - - // write text to contentStream (title) - contentStream.newLineAtOffset(25, page1.getMediaBox().getHeight() - 50); - contentStream.showText("Teragrep Results"); - contentStream.endText(); - - // draw table and close stream - table.draw(); - contentStream.close(); - - // output pdf to stream - ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); - - //pdfDoc.save(new File("/tmp/pth10_" + UUID.randomUUID() + ".pdf")); // comment out - file system save for debugging - pdfDoc.save(outputStream); - pdfDoc.close(); - - // stream to base64 and attach to email with given headers - byte[] fileBase64ByteArray = java.util.Base64.getEncoder().encode(outputStream.toByteArray()); - - // headers for attachment - InternetHeaders fileHeaders = new InternetHeaders(); - fileHeaders.setHeader("Content-Type", "application/pdf; name=\"results.pdf\""); - fileHeaders.setHeader("Content-Transfer-Encoding", "base64"); - fileHeaders.setHeader("Content-Disposition", "attachment; filename=\"results.pdf\""); - - // body part for attachment - MimeBodyPart attachmentBodyPart = new MimeBodyPart(fileHeaders, fileBase64ByteArray); - attachmentBodyPart.setFileName("results.pdf"); - 
- multipart.addBodyPart(attachmentBodyPart); // attachment - - } catch (IOException e) { - LOGGER.error("sendPdf IOException: <{}>", e.getMessage()); - e.printStackTrace(); - } - - - - } - - // add inline results to multipart if inline=true - if (sendResults && inline) { - if (dataBodyPart != null) { - multipart.addBodyPart(dataBodyPart); - } - } - - - // Set multipart as the content for message and send - multipart.addBodyPart(footerBodyPart); // add footer - message.setContent(multipart); - Transport.send(message); - - } - catch (MessagingException me) { - LOGGER.error("An error occurred trying to send email using the sendemail command. Details:"); - me.printStackTrace(); - - // FIXME: Implement: Throw an exception if not in graceful mode - //if (!graceful) { - throw new RuntimeException("Error sending email using sendemail command! Details: " + me.getMessage()); - //} - } - } + + private static final Logger LOGGER = LoggerFactory.getLogger(SendemailResultsProcessor.class); + + private static final long serialVersionUID = 1L; + private static List listOfRows = new ArrayList<>(); + private static int maxCount = 50000; + private static int count = 0; + + // from constructor + private boolean use_tls = false; + private String server = "localhost"; + private int port = 25; + private boolean use_ssl = false; + //DPLParserCatalystContext catCtx = null; + private String fromEmail = null; + private String toEmails = null; + private String ccEmails = null; + private String bccEmails = null; + private String subject = null; + private String customMessageContent = null; + + private String format = "csv"; + private boolean sendResults = false; + private boolean inline = false; + private boolean sendCsv = false; + private boolean sendPdf = false; + private String customFooterContent = null; + private String paperSize = null; + private String paperOrientation = null; + private String content_type = null; + + private static PDPage page1 = new PDPage(); + private static PDDocument pdfDoc = new PDDocument(); + private static MimeBodyPart dataBodyPart = new MimeBodyPart(); + private static MimeBodyPart attachmentBodyPart = new MimeBodyPart(); + + private static String username = ""; + private static String password = ""; + + private String urlToParagraph = null; + + private boolean smtpDebug = false; // enable debugging for email sending + + // Was the email processor called already? (make sure that email is only sent once if sendresults=false) + private boolean isCalledBefore = false; + + /** + * Construct the SendemailResultsProcessor with given parameters. + * + * @param use_tls Use TLS or not + * @param server server host name (e.g. localhost) + * @param port server port number (e.g. 25) + * @param use_ssl Use SSL or not + * @param username SMTP username + * @param password SMTP password + * @param fromEmail Send from email + * @param toEmails Send to emails (separate with comma) + * @param ccEmails cc emails (separate with comma) + * @param bccEmails bcc emails (separate with comma) + * @param subject custom subject if needed + * @param customMessageContent custom message if needed + * @param format format of inline results (csv, raw, table) + * @param sendResults whether or not include results in email + * @param inline whether or not include inline results in email + * @param sendCsv send csv attachment? + * @param sendPdf send pdf attachment? + * @param customFooterContent custom footer if needed + * @param paperSize custom paperSize if needed (e.g. 
"a4") + * @param paperOrientation custom paper orientation (e.g. "landscape" or "portrait") + * @param content_type "plain" text or "html" + * @param maxInputs maximum batch size to send at a time + * @param smtpDebug enable additional SMTP debug logging + * @param urlToParagraph URL address to paragraph containing the results + */ + public SendemailResultsProcessor( + boolean use_tls, + String server, + int port, + boolean use_ssl, + String username, + String password, + String fromEmail, + String toEmails, + String ccEmails, + String bccEmails, + String subject, + String customMessageContent, + String format, + boolean sendResults, + boolean inline, + boolean sendCsv, + boolean sendPdf, + String customFooterContent, + String paperSize, + String paperOrientation, + String content_type, + int maxInputs, + String urlToParagraph, + boolean smtpDebug + ) { + super(); + + this.use_tls = use_tls; + this.server = server; + this.port = port; + this.use_ssl = use_ssl; + + SendemailResultsProcessor.username = username; + SendemailResultsProcessor.password = password; + + this.fromEmail = fromEmail; + this.toEmails = toEmails; + this.ccEmails = ccEmails; + this.bccEmails = bccEmails; + this.subject = subject; + this.customMessageContent = customMessageContent; + + this.format = format; + this.sendResults = sendResults; + this.inline = inline; + this.sendCsv = sendCsv; + this.sendPdf = sendPdf; + this.customFooterContent = customFooterContent; + this.paperSize = paperSize; + this.paperOrientation = paperOrientation; + this.content_type = content_type; + + SendemailResultsProcessor.maxCount = maxInputs; // set to other than 50k + + this.urlToParagraph = urlToParagraph; + this.smtpDebug = smtpDebug; + } + + /** + * Used for testing translation; not to be used outside of testing and/or debugging. Returns all parameters in a Map + * + * @return map of parameters, where key= variable name, value= the value of that variable + */ + @VisibleForTesting + public Map getParameters() { + Map params = new HashMap<>(); + + params.put("use_tls", String.valueOf(use_tls)); + params.put("server", server); + params.put("port", String.valueOf(port)); + params.put("use_ssl", String.valueOf(use_ssl)); + params.put("username", username); + params.put("password", password != null ? "***" : "NULL"); + params.put("fromEmail", fromEmail); + params.put("toEmails", toEmails); + params.put("bccEmails", bccEmails); + params.put("ccEmails", ccEmails); + params.put("subject", subject); + params.put("customMessageContent", customMessageContent); + params.put("format", format); + params.put("sendResults", String.valueOf(sendResults)); + params.put("inline", String.valueOf(inline)); + params.put("sendCsv", String.valueOf(sendCsv)); + params.put("sendPdf", String.valueOf(sendPdf)); + params.put("customFooterContent", customFooterContent); + params.put("paperSize", paperSize); + params.put("paperOrientation", paperOrientation); + params.put("content_type", content_type); + params.put("maxinputs", String.valueOf(maxCount)); + params.put("urlToParagraph", urlToParagraph); + params.put("smtpDebug", String.valueOf(smtpDebug)); + + return params; + } + + /** + * Has the call() function of this object been used even once? 
+ * + * @return true or false + */ + public boolean getIsCalledBefore() { + return this.isCalledBefore; + } + + /** + * Builds email on maxInputs batches using a list of rows given + * + * @param rows List of rows + * @throws Exception Any exception that occurred during building and sending the email + */ + public void call(List rows) throws Exception { + this.isCalledBefore = true; + // Size of list of rows (collected dataframe) + int size = rows.size(); + // rows needed for current maxInputs email batch + int remainingToBeAdded = maxCount - count; + + // none remaining, build email + if (remainingToBeAdded == 0) { + buildEmail(listOfRows); + listOfRows.clear(); + count = 0; + } + // Needs more than currently in list of rows + else if (remainingToBeAdded > size) { + listOfRows.addAll(rows); + count += size; + } + // Current email batch needs less than in list of rows, + // call this function again with surplus rows + else { // remaining != 0, remaining < size + listOfRows.addAll(rows.subList(0, remainingToBeAdded)); + buildEmail(listOfRows); + listOfRows.clear(); + count = 0; + + List toAdd = rows.subList(remainingToBeAdded, rows.size()); + this.call(toAdd); + } + } + + /** + * Send email without rows (sendresults=false) + * + * @throws Exception Any exception that occurred during building and sending the email + */ + public void call() throws Exception { + this.isCalledBefore = true; + buildEmail(null); + } + + /** + * flushes the remaining rows that were not processed as they did not reach the target count + * + * @throws Exception Any error that occurred during the flush() + */ + public void flush() throws Exception { + LOGGER.info("Flushing email processor!"); + if (listOfRows != null && listOfRows.size() > 0) { + buildEmail(listOfRows); + listOfRows.clear(); + count = 0; + } + } + + /** + * Builds the email from current content of listOfRows + * + * @throws MessagingException + */ + private void buildEmail(List listOfRows) throws MessagingException { + LOGGER.info("Building email!"); + + // Properties instance used for setting up email parameters + Properties emailProp = new Properties(); + emailProp.put("mail.smtp.auth", use_ssl || use_tls); // use auth if either ssl or tls is used + emailProp.put("mail.smtp.starttls.enable", use_tls); + emailProp.put("mail.smtp.host", server); + emailProp.put("mail.smtp.port", port); + emailProp.put("mail.smtp.ssl.enable", use_ssl); + + LOGGER.info("Sendemail properties: <{}>", emailProp.entrySet()); + + // user and pass from zeppelin config + final Session session = Session.getInstance(emailProp, new Authenticator() { + + @Override + protected PasswordAuthentication getPasswordAuthentication() { + return new PasswordAuthentication(username, password); + } + }); + + // Set debugging, if given in zeppelin config + session.setDebug(this.smtpDebug); + + // Start building the email message + try { + final Message message = new MimeMessage(session); + + // from, to, cc and bcc emails + if (fromEmail != null) { + message.setFrom(new InternetAddress(fromEmail)); + } + + if (toEmails != null) { + message.setRecipients(Message.RecipientType.TO, InternetAddress.parse(toEmails)); + } + else { + throw new IllegalArgumentException( + "Sendemail command did not contain a required parameter: 'to=' !" 
+ ); + } + + if (ccEmails != null) { + message.setRecipients(Message.RecipientType.CC, InternetAddress.parse(ccEmails)); + } + + if (bccEmails != null) { + message.setRecipients(Message.RecipientType.BCC, InternetAddress.parse(bccEmails)); + } + + // set subject if given in command, otherwise defaults to "Teragrep Results" + message.setSubject(subject != null ? subject : "Teragrep Results"); + + // line break based on content_type + String lineBreak = "\n"; + if (this.content_type.equals("html")) { + lineBreak = "
"; + } + + // Message depends on given parameters + // sendResults=false -> Search complete + // sendResults=true, inline=true, and no attachments -> Search results. + // sendResults=true with attachments -> Search results attached. + // + // Apply custom message if it was given in the command + String messageContent = ""; + if (customMessageContent == null) { + if (!sendResults) { + messageContent = "Search complete."; + } + else if (sendResults && inline && (!sendPdf || !sendCsv)) { + messageContent = "Search results."; + } + else if (sendResults && (sendPdf || sendCsv)) { + messageContent = "Search results attached."; + } + } + else { + messageContent = customMessageContent; + } + + // Add url to paragraph + if (this.urlToParagraph != null) { + if (this.content_type.equals("html")) { + messageContent += String + .format("
View results in Teragrep", this.urlToParagraph); + } + else { + // plain + messageContent += String.format("\n%s", this.urlToParagraph); + } + + } + + // Footer (again, custom footer will be applied if it was given in the command) + MimeBodyPart footerBodyPart = new MimeBodyPart(); + String footerContent = "This email was generated via the sendemail command. Not the correct recipient? Contact your Teragrep administrator." + + lineBreak + "Teragrep - Know Everything"; + if (customFooterContent != null) { + footerContent = customFooterContent; + } + footerBodyPart + .setContent(footerContent, content_type.equals("html") ? "text/html; charset=utf-8" : "text/plain; charset=utf-8"); + + // Full email will be assembled to MimeMultipart + final Multipart multipart = new MimeMultipart(); + + // Set messageContent to MimeBodyPart + MimeBodyPart messageBodyPart = new MimeBodyPart(); + messageBodyPart + .setContent(messageContent, content_type.equals("html") ? "text/html; charset=utf-8" : "text/plain; charset=utf-8"); + + // Add message bodypart to MimeMultipart + multipart.addBodyPart(messageBodyPart); + + // pdf paper settings + // Object hierarchy for pdf document + // pdfDoc (main document) <- page1 (page) <- table (outside table) <- t (internal data) + pdfDoc = new PDDocument(); + page1 = null; + + // set paper size + float paperWidth = 0f, paperHeight = 0f; + + switch (paperSize) { + case "legal": + paperWidth = PDRectangle.LEGAL.getWidth(); + paperHeight = PDRectangle.LEGAL.getHeight(); + break; + case "a2": + paperWidth = PDRectangle.A2.getWidth(); + paperHeight = PDRectangle.A2.getHeight(); + break; + case "a3": + case "ledger": + // ledger is equivalent to A3 + paperWidth = PDRectangle.A3.getWidth(); + paperHeight = PDRectangle.A3.getHeight(); + break; + case "a4": + paperWidth = PDRectangle.A4.getWidth(); + paperHeight = PDRectangle.A4.getHeight(); + break; + case "a5": + paperWidth = PDRectangle.A5.getWidth(); + paperHeight = PDRectangle.A5.getHeight(); + break; + case "letter": + default: + // letter is default + paperWidth = PDRectangle.LETTER.getWidth(); + paperHeight = PDRectangle.LETTER.getHeight(); + break; + } + + // Create page with set paper orientation and size + if (paperOrientation.equals("landscape")) { + page1 = new PDPage(new PDRectangle(paperHeight, paperWidth)); + } + else { + // default (portrait) + page1 = new PDPage(new PDRectangle(paperWidth, paperHeight)); + } + + String dataContent = null; + + // Get search results if sendResults = true + dataBodyPart = new MimeBodyPart(); + + // DatasetToTextBuilder init + DatasetToTextBuilder txtBuilder = new DatasetToTextBuilder(format, lineBreak); + if (sendResults && inline) { + dataContent = txtBuilder.build(listOfRows); + // set dataContent to dataBodyPart + dataBodyPart + .setContent(dataContent, content_type.equals("html") ? 
"text/html; charset=utf-8" : "text/plain; charset=utf-8"); + } + + // sendCsv and sendPdf + if (sendResults && sendCsv) { + if (!inline || format != "csv" || !lineBreak.equals("\n")) { + // dataContent is something other than csv + // re-generate csv version + txtBuilder = new DatasetToTextBuilder("csv", "\n"); + dataContent = txtBuilder.build(listOfRows); + } + + byte[] fileBase64ByteArray; + try { + fileBase64ByteArray = java.util.Base64.getEncoder().encode(dataContent.getBytes("UTF-8")); + // headers for attachment + InternetHeaders fileHeaders = new InternetHeaders(); + fileHeaders.setHeader("Content-Type", "text/csv; name=\"results.csv\""); + fileHeaders.setHeader("Content-Transfer-Encoding", "base64"); + fileHeaders.setHeader("Content-Disposition", "attachment; filename=\"results.csv\""); + // body part for attachment + attachmentBodyPart = new MimeBodyPart(fileHeaders, fileBase64ByteArray); + attachmentBodyPart.setFileName("results.csv"); + + multipart.addBodyPart(attachmentBodyPart); // attachment + + } + catch (UnsupportedEncodingException e) { + e.printStackTrace(); + } + } + else if (sendResults && sendPdf) { + if (!inline || format != "csv" || !lineBreak.equals("\n")) { + // dataContent is something other than csv + // re-generate csv version + // csv is used to generate table to pdf + txtBuilder = new DatasetToTextBuilder("csv", "\n"); + dataContent = txtBuilder.build(listOfRows); + } + + // Generate content + try { + // add page to main document and initialize contentStream + pdfDoc.addPage(page1); + PDPageContentStream contentStream = new PDPageContentStream(pdfDoc, page1); + + // Setup contentStream for text + contentStream.beginText(); + contentStream.setFont(PDType1Font.TIMES_ROMAN, 16); + contentStream.setLeading(14.5f); + + // table settings + float margin = 50; + float yStartNewPage = page1.getMediaBox().getHeight() - (2 * margin); + float tableWidth = page1.getMediaBox().getWidth() - (2 * margin); + boolean drawContent = true; + float yStart = yStartNewPage; + float bottomMargin = 70; + float yPosition = page1.getMediaBox().getHeight() - 100; + + // Generate BaseTable and DataTable + BaseTable table = new BaseTable( + yPosition, + yStartNewPage, + bottomMargin, + tableWidth, + margin, + pdfDoc, + page1, + true, + drawContent + ); + DataTable t = new DataTable(table, page1); + t.addCsvToTable(dataContent, DataTable.HASHEADER, ','); + + // write text to contentStream (title) + contentStream.newLineAtOffset(25, page1.getMediaBox().getHeight() - 50); + contentStream.showText("Teragrep Results"); + contentStream.endText(); + + // draw table and close stream + table.draw(); + contentStream.close(); + + // output pdf to stream + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + + //pdfDoc.save(new File("/tmp/pth10_" + UUID.randomUUID() + ".pdf")); // comment out - file system save for debugging + pdfDoc.save(outputStream); + pdfDoc.close(); + + // stream to base64 and attach to email with given headers + byte[] fileBase64ByteArray = java.util.Base64.getEncoder().encode(outputStream.toByteArray()); + + // headers for attachment + InternetHeaders fileHeaders = new InternetHeaders(); + fileHeaders.setHeader("Content-Type", "application/pdf; name=\"results.pdf\""); + fileHeaders.setHeader("Content-Transfer-Encoding", "base64"); + fileHeaders.setHeader("Content-Disposition", "attachment; filename=\"results.pdf\""); + + // body part for attachment + MimeBodyPart attachmentBodyPart = new MimeBodyPart(fileHeaders, fileBase64ByteArray); + 
attachmentBodyPart.setFileName("results.pdf"); + + multipart.addBodyPart(attachmentBodyPart); // attachment + + } + catch (IOException e) { + LOGGER.error("sendPdf IOException: <{}>", e.getMessage()); + e.printStackTrace(); + } + + } + + // add inline results to multipart if inline=true + if (sendResults && inline) { + if (dataBodyPart != null) { + multipart.addBodyPart(dataBodyPart); + } + } + + // Set multipart as the content for message and send + multipart.addBodyPart(footerBodyPart); // add footer + message.setContent(multipart); + Transport.send(message); + + } + catch (MessagingException me) { + LOGGER.error("An error occurred trying to send email using the sendemail command. Details:"); + me.printStackTrace(); + + // FIXME: Implement: Throw an exception if not in graceful mode + //if (!graceful) { + throw new RuntimeException("Error sending email using sendemail command! Details: " + me.getMessage()); + //} + } + } } diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/teragrep/HdfsSaveMetadata.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/teragrep/HdfsSaveMetadata.java index 8d619f1..9f1126e 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/teragrep/HdfsSaveMetadata.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/teragrep/HdfsSaveMetadata.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.teragrep; import org.apache.spark.sql.types.StructType; @@ -53,12 +52,14 @@ import java.util.Map; /** - * Object used to save metadata regarding the HDFS saved dataset. - * Contains the schema, save timestamp and retention timespan. + * Object used to save metadata regarding the HDFS saved dataset. Contains the schema, save timestamp and retention + * timespan. */ public class HdfsSaveMetadata implements Serializable { + // stub object? 
private final boolean isStub; + public HdfsSaveMetadata() { this.isStub = false; } @@ -66,6 +67,7 @@ public HdfsSaveMetadata() { public HdfsSaveMetadata(boolean isStub) { this.isStub = isStub; } + private static final long serialVersionUID = 1L; // schema that is used when saving to hdfs diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/teragrep/SyslogStreamer.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/teragrep/SyslogStreamer.java index 830003e..3b51d69 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/teragrep/SyslogStreamer.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/teragrep/SyslogStreamer.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.commands.transformstatement.teragrep; import com.cloudbees.syslog.Facility; @@ -105,14 +104,15 @@ public void setAppName(String appName) { } /** - * Base constructor for the SyslogStreamer. Use the {@link #SyslogStreamer(String, int) secondary constructor} - * and provide it with the relp server hostname and port instead. + * Base constructor for the SyslogStreamer. Use the {@link #SyslogStreamer(String, int) secondary constructor} and + * provide it with the relp server hostname and port instead. */ public SyslogStreamer() { } /** * Constructor for SyslogStreamer, provide the RELP server's hostname and port. + * * @param relpHost relp server hostname/ip address * @param relpPort relp server port */ @@ -123,6 +123,7 @@ public SyslogStreamer(String relpHost, int relpPort) { /** * Connects to the RELP server relpHost:relpPort
+ * * @throws RuntimeException if the server is unavailable for more than {@link #maxFailedConnectionAttempts} */ private void connect() { @@ -139,22 +140,32 @@ private void connect() { } if (connected) { - LOGGER.info("SyslogStreamer connected to RELP server host=<[{}]> port=<[{}]> !",relpHostAddress, relpPort); + LOGGER + .info( + "SyslogStreamer connected to RELP server host=<[{}]> port=<[{}]> !", relpHostAddress, + relpPort + ); failedConnectionAttempts = 0; notConnected = false; } else { if (failedConnectionAttempts++ >= maxFailedConnectionAttempts) { - throw new RuntimeException("Connection to RELP server failed more times than allowed. " + - "(Maximum " + maxFailedConnectionAttempts + " times)"); + throw new RuntimeException( + "Connection to RELP server failed more times than allowed. " + "(Maximum " + + maxFailedConnectionAttempts + " times)" + ); } try { - LOGGER.warn("Connection to RELP server was unsuccessful, attempting again in <{}> ms", reconnectInterval); + LOGGER + .warn( + "Connection to RELP server was unsuccessful, attempting again in <{}> ms", + reconnectInterval + ); Thread.sleep(this.reconnectInterval); } catch (InterruptedException e) { - LOGGER.error("An error occurred while waiting for reconnection: <{}>", e.getMessage()); + LOGGER.error("An error occurred while waiting for reconnection: <{}>", e.getMessage()); } } } @@ -220,7 +231,8 @@ private void stop() { * Main mapping function call function. Initializes the RELP sender, connects to the RELP server,
* and builds a syslog message from each of the rows given to this function.
* To be used with the dataset map() function.
- * E.g. ds.map(new SyslogStreamer(host, port), ds.exprEnc());

+ * E.g. ds.map(new SyslogStreamer(host, port), ds.exprEnc());
+ *
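// Illustrative usage sketch mirroring the "E.g." line in the javadoc above; the dataset name,
// hostname and port are assumptions made for the example, not values from the patch.
import com.teragrep.pth10.ast.commands.transformstatement.teragrep.SyslogStreamer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

final class SyslogStreamerUsageSketch {

    // Each row is sent to the RELP server as a syslog message and returned unchanged.
    static Dataset<Row> streamToRelp(final Dataset<Row> rowDataset) {
        return rowDataset.map(new SyslogStreamer("relp.example.com", 601), rowDataset.exprEnc());
    }
}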
* * @param row Input row to send as syslog * @return the given input row unchanged @@ -262,18 +274,18 @@ else if (field.name().equals("_raw")) { // If _time column didn't exist, get current time as the syslog message time if (!timeSetFromColumn) { - time = Instant.now().getEpochSecond()*1000L; + time = Instant.now().getEpochSecond() * 1000L; } // build the syslog message final SyslogMessage syslogMessage = new SyslogMessage() - .withTimestamp(time) // _time column as syslog message time + .withTimestamp(time) // _time column as syslog message time .withSeverity(Severity.WARNING) .withAppName(appName) .withHostname(hostname) .withFacility(Facility.USER) - .withSDElement(teragrep_output_48577) // teragrep-output@48577 SDElement - .withMsg(payload); // _raw column as syslog payload + .withSDElement(teragrep_output_48577) // teragrep-output@48577 SDElement + .withMsg(payload); // _raw column as syslog payload // send to server this.append(syslogMessage); @@ -283,6 +295,7 @@ else if (field.name().equals("_raw")) { /** * Appends the given syslog message into the relp batch, and sends it to the server. + * * @param syslogMessage the message to be appended to the batch, and sent to the server. */ private void append(SyslogMessage syslogMessage) { @@ -307,7 +320,8 @@ private void append(SyslogMessage syslogMessage) { this.tearDown(); try { Thread.sleep(this.reconnectInterval); - } catch (InterruptedException e) { + } + catch (InterruptedException e) { LOGGER.warn("Reconnect sleep was interrupted", e); } this.connect(); diff --git a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/xmlkv/XmlkvUDF.java b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/xmlkv/XmlkvUDF.java index c1b4c88..28ac1bf 100644 --- a/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/xmlkv/XmlkvUDF.java +++ b/src/main/java/com/teragrep/pth10/ast/commands/transformstatement/xmlkv/XmlkvUDF.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -60,7 +60,9 @@ import java.util.Map; public class XmlkvUDF implements UDF1> { + private static final Logger LOGGER = LoggerFactory.getLogger(XmlkvUDF.class); + @Override public Map call(String input) throws Exception { Map m = new HashMap<>(); @@ -72,19 +74,24 @@ public Map call(String input) throws Exception { Node n = doc.getDocumentElement(); buildMapFromXmlNodes(n, m); - } catch (SAXParseException spe) { + } + catch (SAXParseException spe) { // don't catch other than parse errors - LOGGER.warn("Could not parse col <{}> on line <{}>, returning empty.", spe.getColumnNumber(), spe.getLineNumber()); + LOGGER + .warn( + "Could not parse col <{}> on line <{}>, returning empty.", spe.getColumnNumber(), + spe.getLineNumber() + ); } return m; } /** - * Gets all latest occurrences of tag-contents pairs - *
deepest node => its contents
+ * Gets all latest occurrences of tag-contents pairs
deepest node => its contents
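// Illustrative sketch only (an assumed traversal, not the patched buildMapFromXmlNodes
// implementation): one way to walk a DOM tree so that leaves visited later in the walk
// overwrite earlier ones, meaning the deepest leaf with a given tag name wins, matching
// the "deepest node => its contents" description above.
import java.util.HashMap;
import java.util.Map;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

final class DeepestTagTextSketch {

    static void collect(final Node node, final Map<String, String> map) {
        final NodeList children = node.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            final Node child = children.item(i);
            if (child.getNodeType() == Node.ELEMENT_NODE) {
                // record a leaf element's text, then recurse so deeper matches overwrite it
                if (child.getChildNodes().getLength() == 1
                        && child.getFirstChild().getNodeType() == Node.TEXT_NODE) {
                    map.put(child.getNodeName(), child.getTextContent());
                }
                collect(child, map);
            }
        }
    }

    static Map<String, String> tagTextPairs(final Document doc) {
        final Map<String, String> m = new HashMap<>();
        collect(doc.getDocumentElement(), m);
        return m;
    }
}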
+ * * @param rootNode root node (Main Document Element) - * @param map Final map to be returned out of the UDF + * @param map Final map to be returned out of the UDF */ private void buildMapFromXmlNodes(final Node rootNode, final Map map) { // RootNode is text diff --git a/src/main/java/com/teragrep/pth10/ast/time/RelativeOffset.java b/src/main/java/com/teragrep/pth10/ast/time/RelativeOffset.java index ebd39d8..1b7773f 100644 --- a/src/main/java/com/teragrep/pth10/ast/time/RelativeOffset.java +++ b/src/main/java/com/teragrep/pth10/ast/time/RelativeOffset.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,10 +43,8 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.time; -import com.teragrep.pth10.steps.teragrep.TeragrepSyslogStep; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -60,6 +58,7 @@ * Relative offset of time. Used to add or subtract time. */ public final class RelativeOffset { + private static final Logger LOGGER = LoggerFactory.getLogger(RelativeOffset.class); private final long amount; @@ -107,11 +106,13 @@ public Instant addOffset(Instant time) { default: throw new RuntimeException("Relative timestamp contained an invalid time unit"); } - } catch (ArithmeticException ae) { + } + catch (ArithmeticException ae) { // on overflow, check positivity/negativity and pin to max/min if (amount < 0) { time = Instant.ofEpochMilli(0); - } else { + } + else { time = Instant.ofEpochMilli(Long.MAX_VALUE); } ldt = time.atZone(ZoneId.systemDefault()).toLocalDateTime(); @@ -121,12 +122,13 @@ public Instant addOffset(Instant time) { LOGGER.info("Epoch resulted in year over 9999, setting it to year 9999."); ldt = ldt.withYear(9999); time = ldt.atZone(ZoneId.systemDefault()).toInstant(); - } else if (ldt.getYear() < 1000) { + } + else if (ldt.getYear() < 1000) { LOGGER.info("Epoch resulted in year less than 1000, setting it to year 1000."); ldt = ldt.withYear(1000); time = ldt.atZone(ZoneId.systemDefault()).toInstant(); } - + return time; } } diff --git a/src/main/java/com/teragrep/pth10/ast/time/RelativeTimeParser.java b/src/main/java/com/teragrep/pth10/ast/time/RelativeTimeParser.java index 614359b..689b4d2 100644 --- a/src/main/java/com/teragrep/pth10/ast/time/RelativeTimeParser.java +++ b/src/main/java/com/teragrep/pth10/ast/time/RelativeTimeParser.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. 
* * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.time; import com.teragrep.pth10.ast.TextString; @@ -60,13 +59,7 @@ public class RelativeTimeParser { enum OffsetUnit { - SECONDS, - MINUTES, - HOURS, - DAYS, - WEEKS, - MONTHS, - YEARS + SECONDS, MINUTES, HOURS, DAYS, WEEKS, MONTHS, YEARS } enum SnapUnit { @@ -94,6 +87,7 @@ public RelativeTimeParser() { /** * Parses the given String into a RelativeTimestamp object. + * * @param timestamp relative time as string, ex. -12h@day * @return relative time as object */ @@ -101,7 +95,9 @@ public RelativeTimestamp parse(String timestamp) { timestamp = new UnquotedText(new TextString(timestamp)).read(); // strip quotes // regex that should match all types of relative timestamps but not normal timestamps - Matcher relativeTimeMatcher = Pattern.compile("^((-|\\+)(\\d*[A-Za-z]+))?(@[A-Za-z]+(-|\\+)?[\\dA-Za-z]*)?").matcher(timestamp); + Matcher relativeTimeMatcher = Pattern + .compile("^((-|\\+)(\\d*[A-Za-z]+))?(@[A-Za-z]+(-|\\+)?[\\dA-Za-z]*)?") + .matcher(timestamp); // no match and isn't keyword "now" -> assume it is a normal timestamp and use unixEpochFromString() if (!relativeTimeMatcher.matches() && !timestamp.equalsIgnoreCase("now")) { @@ -120,7 +116,8 @@ public RelativeTimestamp parse(String timestamp) { offset = parseRelativeOffset(offsetTimestamp); snap = parseSnapToTime(snapTimestamp); - } else { + } + else { // only offset present, or incorrect timestamp offset = parseRelativeOffset(timestamp); } @@ -199,7 +196,8 @@ private SnapToTime parseSnapToTime(String timestamp) { if (relativeOffset == null) { // snap without offset snapToTime = new SnapToTime(snapUnit); - } else { + } + else { // with offset snapToTime = new SnapToTime(snapUnit, relativeOffset); } diff --git a/src/main/java/com/teragrep/pth10/ast/time/RelativeTimestamp.java b/src/main/java/com/teragrep/pth10/ast/time/RelativeTimestamp.java index 50d71e1..891ba1b 100644 --- a/src/main/java/com/teragrep/pth10/ast/time/RelativeTimestamp.java +++ b/src/main/java/com/teragrep/pth10/ast/time/RelativeTimestamp.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.time; import java.sql.Timestamp; @@ -61,6 +60,7 @@ public RelativeTimestamp(RelativeOffset offset, SnapToTime snapToTime) { /** * Calculate epoch time from relative time modifier. IE. 
now()- time range + * * @param timestamp A moment in time, usually the current time * @return Calculated time as epoch milliseconds */ @@ -68,10 +68,13 @@ public long calculate(Timestamp timestamp) { Instant time = timestamp.toInstant(); // if both are null, "now" option is left. Therefore, returns current time. - if (offset == null && snapToTime == null) time = new Timestamp(System.currentTimeMillis()).toInstant(); + if (offset == null && snapToTime == null) + time = new Timestamp(System.currentTimeMillis()).toInstant(); - if (offset != null) time = offset.addOffset(time); - if (snapToTime != null) time = snapToTime.snap(time); + if (offset != null) + time = offset.addOffset(time); + if (snapToTime != null) + time = snapToTime.snap(time); return time.getEpochSecond(); } } diff --git a/src/main/java/com/teragrep/pth10/ast/time/SnapToTime.java b/src/main/java/com/teragrep/pth10/ast/time/SnapToTime.java index 0652c30..af2071b 100644 --- a/src/main/java/com/teragrep/pth10/ast/time/SnapToTime.java +++ b/src/main/java/com/teragrep/pth10/ast/time/SnapToTime.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.ast.time; import java.time.*; @@ -51,8 +50,7 @@ import java.time.temporal.TemporalAdjusters; /** - * Truncates time to the given unit. Can contain an offset. - * Example: @d+3h snaps to 3AM of the same day. + * Truncates time to the given unit. Can contain an offset. Example: @d+3h snaps to 3AM of the same day. */ public final class SnapToTime { @@ -71,6 +69,7 @@ public SnapToTime(RelativeTimeParser.SnapUnit snapUnit) { /** * Truncate the time to the start of the SnapUnit and add the offset. 
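// Worked example (illustrative only, assumed helper name): the "@d+3h" semantics described in
// the SnapToTime javadoc above, expressed directly with java.time. The time is truncated to the
// start of the day in the system time zone and the 3 hour offset is then added.
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.temporal.ChronoUnit;

final class SnapExampleSketch {

    // e.g. 2024-09-02T14:08 local time -> 2024-09-02T00:00 -> 2024-09-02T03:00 local time
    static Instant snapToDayPlusThreeHours(final Instant time) {
        final LocalDateTime ldt = LocalDateTime.ofInstant(time, ZoneId.systemDefault());
        return ldt.truncatedTo(ChronoUnit.DAYS).plusHours(3).atZone(ZoneId.systemDefault()).toInstant();
    }
}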
+ * * @param time original time * @return modified time */ @@ -163,7 +162,7 @@ else if ((q3_ldt.isBefore(ldt) || q3_ldt.isEqual(ldt)) && q4_ldt.isAfter(ldt)) { time = q3_ldt.atZone(ZoneId.systemDefault()).toInstant(); } // After Q4 - else if ((q4_ldt.isBefore(ldt) || q4_ldt.isEqual(ldt)) ) { + else if ((q4_ldt.isBefore(ldt) || q4_ldt.isEqual(ldt))) { time = q4_ldt.atZone(ZoneId.systemDefault()).toInstant(); } else { diff --git a/src/main/java/com/teragrep/pth10/datasources/ArchiveQuery.java b/src/main/java/com/teragrep/pth10/datasources/ArchiveQuery.java index 76c5dc4..dc4f745 100644 --- a/src/main/java/com/teragrep/pth10/datasources/ArchiveQuery.java +++ b/src/main/java/com/teragrep/pth10/datasources/ArchiveQuery.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -45,19 +45,18 @@ */ package com.teragrep.pth10.datasources; - /** - * Class representing an archive query. The query is in XML format and is used - * by the PTH-06 Datasource Component. If no query is provided, the constructor will - * set the isStub property as true, which should be used instead of a null check. + * Class representing an archive query. The query is in XML format and is used by the PTH-06 Datasource Component. If no + * query is provided, the constructor will set the isStub property as true, which should be used instead of a null + * check. */ public final class ArchiveQuery { + public final String queryString; public final boolean isStub; /** - * Case: No query given. Traditionally this would be the null case. - * Check for a "null case" using the isStub field. + * Case: No query given. Traditionally this would be the null case. Check for a "null case" using the isStub field. */ public ArchiveQuery() { this.queryString = ""; @@ -65,8 +64,9 @@ public ArchiveQuery() { } /** - * Case: A non-empty query given. Traditionally this would be a non-null case. - * It does not guarantee that the query is 100% valid, only that it is of a non-null value. + * Case: A non-empty query given. Traditionally this would be a non-null case. It does not guarantee that the query + * is 100% valid, only that it is of a non-null value. 
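// Illustrative usage sketch (assumed helper, not part of the patch): how the isStub field of
// ArchiveQuery replaces the traditional null check described above. An ArchiveQuery instance
// always exists; callers branch on isStub instead of comparing the object to null.
import com.teragrep.pth10.datasources.ArchiveQuery;

final class ArchiveQueryUsageSketch {

    static String describe(final String xmlQueryOrNull) {
        final ArchiveQuery query = xmlQueryOrNull == null ? new ArchiveQuery() : new ArchiveQuery(xmlQueryOrNull);
        // isStub == true means "no query given"; queryString is then an empty string, never null
        return query.isStub ? "no archive query given" : query.queryString;
    }
}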
+ * * @param queryString XML-formatted string for the Archive */ public ArchiveQuery(final String queryString) { diff --git a/src/main/java/com/teragrep/pth10/datasources/DPLDatasource.java b/src/main/java/com/teragrep/pth10/datasources/DPLDatasource.java index bec4c5c..7c63554 100644 --- a/src/main/java/com/teragrep/pth10/datasources/DPLDatasource.java +++ b/src/main/java/com/teragrep/pth10/datasources/DPLDatasource.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.datasources; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -63,6 +62,7 @@ * DPL Datasource, used for archive and kafka queries */ public class DPLDatasource { + private static final Logger LOGGER = LoggerFactory.getLogger(DPLDatasource.class); private final Config config; @@ -81,7 +81,6 @@ public DPLDatasource(DPLParserCatalystContext catCtx) { this.catCtx = catCtx; } - public Dataset constructStreams(ArchiveQuery archiveQuery, boolean isMetadataQuery) { // resolve archive Query which is then used with archiveDatasource LOGGER.info("DPL Interpreter ArchiveQuery=<[{}]>", archiveQuery); @@ -97,9 +96,9 @@ public Dataset constructStreams(ArchiveQuery archiveQuery, boolean isMetada return archiveDS; } - /** * Setup source stream for query + * * @param query * @return streaming dataset */ @@ -123,48 +122,49 @@ private Dataset archiveStreamConsumerDataset(ArchiveQuery query, boolean is LOGGER.debug("Creating ArchiveSourceProvider"); reader = sparkSession - .readStream() - .format(com.teragrep.pth_06.TeragrepDatasource.class.getName()) - .option("num_partitions", config.getString("dpl.pth_06.partitions")) - .option("S3endPoint", config.getString("fs.s3a.endpoint")) - .option("S3identity", s3identity) - .option("S3credential", s3credential) - .option("DBusername", config.getString("dpl.pth_06.archive.db.username")) - .option("DBpassword", config.getString("dpl.pth_06.archive.db.password")) - .option("DBurl", config.getString("dpl.pth_06.archive.db.url")) - .option("DBstreamdbname", config.getString("dpl.pth_06.archive.db.streamdb.name")) - .option("DBjournaldbname", config.getString("dpl.pth_06.archive.db.journaldb.name")) - .option("hideDatabaseExceptions", config.getString("dpl.pth_06.archive.db.hideDatabaseExceptions")) - .option("skipNonRFC5424Files", config.getString("dpl.pth_06.archive.s3.skipNonRFC5424Files")) - .option("queryXML", query.queryString); + .readStream() + .format(com.teragrep.pth_06.TeragrepDatasource.class.getName()) + .option("num_partitions", config.getString("dpl.pth_06.partitions")) + .option("S3endPoint", config.getString("fs.s3a.endpoint")) + .option("S3identity", s3identity) + .option("S3credential", s3credential) + .option("DBusername", 
config.getString("dpl.pth_06.archive.db.username")) + .option("DBpassword", config.getString("dpl.pth_06.archive.db.password")) + .option("DBurl", config.getString("dpl.pth_06.archive.db.url")) + .option("DBstreamdbname", config.getString("dpl.pth_06.archive.db.streamdb.name")) + .option("DBjournaldbname", config.getString("dpl.pth_06.archive.db.journaldb.name")) + .option("hideDatabaseExceptions", config.getString("dpl.pth_06.archive.db.hideDatabaseExceptions")) + .option("skipNonRFC5424Files", config.getString("dpl.pth_06.archive.s3.skipNonRFC5424Files")) + .option("queryXML", query.queryString); // Add auditInformation options if exists - if( catCtx != null && catCtx.getAuditInformation() != null) { + if (catCtx != null && catCtx.getAuditInformation() != null) { LOGGER.debug("Adding auditInformation"); reader = reader .option("TeragrepAuditQuery", catCtx.getAuditInformation().getQuery()) .option("TeragrepAuditReason", catCtx.getAuditInformation().getReason()) .option("TeragrepAuditUser", catCtx.getAuditInformation().getUser()) - .option("TeragrepAuditPluginClassName", catCtx.getAuditInformation().getTeragrepAuditPluginClassName()); + .option( + "TeragrepAuditPluginClassName", + catCtx.getAuditInformation().getTeragrepAuditPluginClassName() + ); } if (config.getBoolean("dpl.pth_06.archive.enabled")) { LOGGER.debug("Archive is enabled"); - reader = reader - .option("archive.enabled", "true"); + reader = reader.option("archive.enabled", "true"); } else { LOGGER.debug("Archive is disabled"); - reader = reader - .option("archive.enabled", "false"); + reader = reader.option("archive.enabled", "false"); } if (config.hasPath("dpl.pth_06.archive.scheduler")) { String schedulerType = config.getString("dpl.pth_06.archive.scheduler"); LOGGER.debug("Setting scheduler to <[{}]>", schedulerType); if (schedulerType != null && !schedulerType.isEmpty()) { - reader = reader - .option("scheduler", schedulerType); - } else { + reader = reader.option("scheduler", schedulerType); + } + else { LOGGER.warn("DPLDatasource> dpl.pth_06.archive.scheduler given value was null or empty"); } } @@ -197,7 +197,8 @@ private Dataset archiveStreamConsumerDataset(ArchiveQuery query, boolean is LOGGER.debug("Found domainIndex, removing domain"); s3identityWithoutDomain = s3identityWithoutDomain.substring(0, domainIndex); } - String jaasconfig = "org.apache.kafka.common.security.plain.PlainLoginModule required username=\""+s3identityWithoutDomain+"\" password=\""+s3credential+"\";"; + String jaasconfig = "org.apache.kafka.common.security.plain.PlainLoginModule required username=\"" + + s3identityWithoutDomain + "\" password=\"" + s3credential + "\";"; LOGGER.debug("Adding kafka configuration to reader"); reader = reader @@ -209,7 +210,10 @@ private Dataset archiveStreamConsumerDataset(ArchiveQuery query, boolean is .option("kafka.max.poll.records", config.getString("dpl.pth_06.kafka.max.poll.records")) .option("kafka.fetch.max.bytes", config.getString("dpl.pth_06.kafka.fetch.max.bytes")) .option("kafka.fetch.max.wait.ms", config.getString("dpl.pth_06.kafka.fetch.max.wait.ms")) - .option("kafka.max.partition.fetch.bytes", config.getString("dpl.pth_06.kafka.max.partition.fetch.bytes")) + .option( + "kafka.max.partition.fetch.bytes", + config.getString("dpl.pth_06.kafka.max.partition.fetch.bytes") + ) .option("kafka.continuousProcessing", config.getString("dpl.pth_06.kafka.continuousProcessing")); } diff --git a/src/main/java/com/teragrep/pth10/datasources/GeneratedDatasource.java 
b/src/main/java/com/teragrep/pth10/datasources/GeneratedDatasource.java index d37d07e..cf0b4c4 100644 --- a/src/main/java/com/teragrep/pth10/datasources/GeneratedDatasource.java +++ b/src/main/java/com/teragrep/pth10/datasources/GeneratedDatasource.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,10 +43,8 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.datasources; -import com.teragrep.pth_06.TeragrepDatasource; import com.teragrep.pth10.ast.DPLParserCatalystContext; import com.typesafe.config.Config; import org.apache.spark.sql.*; @@ -81,25 +79,24 @@ * Datasource generator class */ public class GeneratedDatasource { + private static final Logger LOGGER = LoggerFactory.getLogger(GeneratedDatasource.class); private final Config config; private SparkSession sparkSession; private DPLParserCatalystContext catCtx; - private final StructType schema = new StructType( - new StructField[]{ - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("origin", DataTypes.StringType, false, new MetadataBuilder().build()), - } - ); + private final StructType schema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("origin", DataTypes.StringType, false, new MetadataBuilder().build()), + }); public GeneratedDatasource(DPLParserCatalystContext catCtx) { this.catCtx = catCtx; @@ -113,107 +110,89 @@ public Dataset constructEmptyStream() throws StreamingQueryException { MemoryStream 
rowMemoryStream = new MemoryStream<>(1, sqlContext, Option.apply(1), encoder); Dataset rowDataset = rowMemoryStream.toDF(); - final String queryName = "construct_empty_" + ((int)(Math.random() * 100000)); + final String queryName = "construct_empty_" + ((int) (Math.random() * 100000)); DataStreamWriter writer = rowDataset.writeStream().format("memory").outputMode(OutputMode.Append()); // generate one row with practically no data // TODO: Not generating any rows stops any progress. Figure out a way to generate a empty *streaming* dataset! - rowMemoryStream.addData(makeRows( - new java.sql.Timestamp(0L), - Collections.singletonList(""), - "", - "", - "", - "", - "", - -1L, - "" - )); + rowMemoryStream + .addData(makeRows(new java.sql.Timestamp(0L), Collections.singletonList(""), "", "", "", "", "", -1L, "")); StreamingQuery sq = this.catCtx.getInternalStreamingQueryListener().registerQuery(queryName, writer); sq.awaitTermination(); // filter the one generated row out to have a truly empty dataset - return rowDataset - .where(functions.col("offset") - .geq(functions.lit(0))); + return rowDataset.where(functions.col("offset").geq(functions.lit(0))); } - public Dataset constructStream(String status, String explainStr) throws StreamingQueryException, InterruptedException, UnknownHostException { + public Dataset constructStream(String status, String explainStr) + throws StreamingQueryException, InterruptedException, UnknownHostException { List lines = new ArrayList<>(); lines.add(status); return constructStream(lines, explainStr); } - public Dataset constructStream(List strings, String commandStr) throws StreamingQueryException, InterruptedException, UnknownHostException { + public Dataset constructStream(List strings, String commandStr) + throws StreamingQueryException, InterruptedException, UnknownHostException { SQLContext sqlContext = sparkSession.sqlContext(); ExpressionEncoder encoder = RowEncoder.apply(schema); - MemoryStream rowMemoryStream = - new MemoryStream<>(1, sqlContext, Option.apply(1), encoder); + MemoryStream rowMemoryStream = new MemoryStream<>(1, sqlContext, Option.apply(1), encoder); if (commandStr == null) { commandStr = "Unspecified"; } Dataset rowDataset = rowMemoryStream.toDF(); - final String queryName = "construct_" + ((int)(Math.random() * 100000)); + final String queryName = "construct_" + ((int) (Math.random() * 100000)); - DataStreamWriter writer = rowDataset - .writeStream() - .format("memory") - .outputMode("append"); + DataStreamWriter writer = rowDataset.writeStream().format("memory").outputMode("append"); long offset = 0; String host = InetAddress.getLocalHost().getHostName(); final String explainStr = commandStr; Timestamp time = Timestamp.valueOf(LocalDateTime.ofInstant(Instant.now(), ZoneOffset.UTC)); - rowMemoryStream.addData( - // make rows containing counter as offset and run as partition - makeRows( - time, // 0 "_time", DataTypes.TimestampType - strings, // 1 "_raw", DataTypes.StringType - "_internal", // 2 "index", DataTypes.StringType - explainStr, // 3 "sourcetype", DataTypes.StringType - host, // 4 "host", DataTypes.StringType, - "teragrep", // 5 "input", DataTypes.StringType - sparkSession.sparkContext().applicationId(), // 6 "partition", DataTypes.StringType - offset, // 7 "offset", DataTypes.LongType - "original-host" // 8 "origin", DataTypes.StringType - ) - ); - - StreamingQuery streamingQuery = this.catCtx.getInternalStreamingQueryListener().registerQuery(queryName, writer); + rowMemoryStream + .addData( + // make rows containing counter as 
offset and run as partition + makeRows( + time, // 0 "_time", DataTypes.TimestampType + strings, // 1 "_raw", DataTypes.StringType + "_internal", // 2 "index", DataTypes.StringType + explainStr, // 3 "sourcetype", DataTypes.StringType + host, // 4 "host", DataTypes.StringType, + "teragrep", // 5 "input", DataTypes.StringType + sparkSession.sparkContext().applicationId(), // 6 "partition", DataTypes.StringType + offset, // 7 "offset", DataTypes.LongType + "original-host" // 8 "origin", DataTypes.StringType + ) + ); + + StreamingQuery streamingQuery = this.catCtx + .getInternalStreamingQueryListener() + .registerQuery(queryName, writer); streamingQuery.awaitTermination(); return rowDataset; } - private Seq makeRows(Timestamp _time, - List _raw, - String index, - String sourcetype, - String host, - String source, - String partition, - Long offset, - String origin){ + private Seq makeRows( + Timestamp _time, + List _raw, + String index, + String sourcetype, + String host, + String source, + String partition, + Long offset, + String origin + ) { ArrayList rowArrayList = new ArrayList<>(); - _raw.forEach(s->{ - Row row = RowFactory.create( - _time, - s, - index, - sourcetype, - host, - source, - partition, - offset, - origin - ); + _raw.forEach(s -> { + Row row = RowFactory.create(_time, s, index, sourcetype, host, source, partition, offset, origin); rowArrayList.add(row); }); Seq rowSeq = JavaConverters.asScalaIteratorConverter(rowArrayList.iterator()).asScala().toSeq(); diff --git a/src/main/java/com/teragrep/pth10/datasources/S3CredentialWallet.java b/src/main/java/com/teragrep/pth10/datasources/S3CredentialWallet.java index 8c81a9d..afb42c7 100644 --- a/src/main/java/com/teragrep/pth10/datasources/S3CredentialWallet.java +++ b/src/main/java/com/teragrep/pth10/datasources/S3CredentialWallet.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.datasources; import org.apache.spark.SparkContext; @@ -54,6 +53,7 @@ * Class to access S3 credentials */ public class S3CredentialWallet { + final private SparkSession spark; final private String identity; final private String path; @@ -68,6 +68,7 @@ public S3CredentialWallet(SparkContext sc) { /** * Gets the credentials from HDFS + * * @return credentials */ public String getCredential() { @@ -79,11 +80,12 @@ public String getCredential() { catch (Exception ignored) { } - return null; + return null; } /** * Gets the current identity + * * @return system username */ public String getIdentity() { diff --git a/src/main/java/com/teragrep/pth10/steps/AbstractStep.java b/src/main/java/com/teragrep/pth10/steps/AbstractStep.java index 6d43cd1..5c0f19a 100644 --- a/src/main/java/com/teragrep/pth10/steps/AbstractStep.java +++ b/src/main/java/com/teragrep/pth10/steps/AbstractStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps; import org.apache.spark.sql.Dataset; @@ -54,6 +53,7 @@ import java.util.Set; public abstract class AbstractStep { + public enum CommandProperty { USES_INTERNAL_BATCHCOLLECT, // Command has an internal batch collect, e.g. sort IGNORE_DEFAULT_SORTING, // Command applies a certain order to the rows @@ -83,6 +83,7 @@ public AbstractStep() { /** * Perform the necessary dataframe operations for the implemented command + * * @param dataset Dataset to operate on * @return Dataframe, which has the operations applied */ diff --git a/src/main/java/com/teragrep/pth10/steps/EmptyDataframeStep.java b/src/main/java/com/teragrep/pth10/steps/EmptyDataframeStep.java index a59b023..49a1d4e 100644 --- a/src/main/java/com/teragrep/pth10/steps/EmptyDataframeStep.java +++ b/src/main/java/com/teragrep/pth10/steps/EmptyDataframeStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -50,10 +50,11 @@ import org.apache.spark.sql.SparkSession; /** - * Used to provide a empty dataframe in case of not using a LogicalStatement in the command. 
- * For example, the '| makeresults' command or some of the Teragrep utility commands. + * Used to provide a empty dataframe in case of not using a LogicalStatement in the command. For example, the '| + * makeresults' command or some of the Teragrep utility commands. */ public class EmptyDataframeStep extends AbstractStep { + @Override public Dataset get(Dataset dataset) { if (dataset != null) { diff --git a/src/main/java/com/teragrep/pth10/steps/Flushable.java b/src/main/java/com/teragrep/pth10/steps/Flushable.java index 4e40245..39b4bec 100644 --- a/src/main/java/com/teragrep/pth10/steps/Flushable.java +++ b/src/main/java/com/teragrep/pth10/steps/Flushable.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps; /** @@ -52,8 +51,8 @@ public interface Flushable { /** - * Flush any remaining data rows from the command. - * Should be called after the whole query to ensure that the last batch of data is also processed correctly. + * Flush any remaining data rows from the command. Should be called after the whole query to ensure that the last + * batch of data is also processed correctly. */ void flush(); } diff --git a/src/main/java/com/teragrep/pth10/steps/NullStep.java b/src/main/java/com/teragrep/pth10/steps/NullStep.java index 09f1431..d25d73d 100644 --- a/src/main/java/com/teragrep/pth10/steps/NullStep.java +++ b/src/main/java/com/teragrep/pth10/steps/NullStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -51,7 +51,8 @@ /** * A dummy step that returns the given dataset as-is */ -public class NullStep extends AbstractStep{ +public class NullStep extends AbstractStep { + @Override public Dataset get(Dataset dataset) { return dataset; diff --git a/src/main/java/com/teragrep/pth10/steps/ParsedResult.java b/src/main/java/com/teragrep/pth10/steps/ParsedResult.java index 7f3151f..f181294 100644 --- a/src/main/java/com/teragrep/pth10/steps/ParsedResult.java +++ b/src/main/java/com/teragrep/pth10/steps/ParsedResult.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps; import java.io.Serializable; @@ -54,13 +53,11 @@ * Wrapper class for the result of parsing a String that might be a number. */ public class ParsedResult implements Serializable { + private static final long serialVersionUID = 1L; public enum Type { - LONG, - DOUBLE, - STRING, - LIST + LONG, DOUBLE, STRING, LIST } private Long longValue = null; diff --git a/src/main/java/com/teragrep/pth10/steps/TypeParser.java b/src/main/java/com/teragrep/pth10/steps/TypeParser.java index 2783cfc..bb045b0 100644 --- a/src/main/java/com/teragrep/pth10/steps/TypeParser.java +++ b/src/main/java/com/teragrep/pth10/steps/TypeParser.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps; import java.io.Serializable; @@ -53,10 +52,12 @@ import java.util.List; public class TypeParser implements Serializable { + private static final long serialVersionUID = 1L; /** * Checks if the Object contains a String, Double or a Long value and returns it as a ParsedResult object. 
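A minimal stand-alone sketch (not part of this patch) of the parsing order the TypeParser javadoc above describes: try Long first, fall back to Double, otherwise keep the String. The class name ParsePrecedenceSketch is hypothetical:

    public class ParsePrecedenceSketch {

        public static Object parse(String input) {
            try {
                return Long.valueOf(input); // integers win
            }
            catch (NumberFormatException e) {
                try {
                    return Double.valueOf(input); // then decimals
                }
                catch (NumberFormatException ignored) {
                    return input; // everything else stays a String
                }
            }
        }

        public static void main(String[] args) {
            System.out.println(parse("42").getClass().getSimpleName());  // Long
            System.out.println(parse("4.2").getClass().getSimpleName()); // Double
            System.out.println(parse("abc").getClass().getSimpleName()); // String
        }
    }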
+ * * @param input Object to parse * @return String, Double or Long wrapped in a ParsedResult */ @@ -69,43 +70,54 @@ public ParsedResult parse(Object input) { try { // try to parse into long parsedResult = new ParsedResult(Long.valueOf((String) input)); - } catch (NumberFormatException nfe) { + } + catch (NumberFormatException nfe) { try { // try to parse into double parsedResult = new ParsedResult(Double.valueOf((String) input)); - } catch (NumberFormatException ignored) { + } + catch (NumberFormatException ignored) { // returns as String } } - } else if (input instanceof Long) { + } + else if (input instanceof Long) { parsedResult = new ParsedResult((Long) input); - } else if (input instanceof WrappedArray) { + } + else if (input instanceof WrappedArray) { WrappedArray wr = (WrappedArray) input; if (wr.length() == 1) { // reparse arrays with one item parsedResult = parse(wr.head()); - } else if (wr.isEmpty()) { + } + else if (wr.isEmpty()) { // empty arrays will be empty string parsedResult = new ParsedResult(""); - } else { + } + else { // arrays with more than one item will be LIST parsedResult = new ParsedResult(JavaConversions.bufferAsJavaList(wr.toBuffer())); } - } else if (input instanceof List) { + } + else if (input instanceof List) { List l = (List) input; if (l.size() == 1) { parsedResult = new ParsedResult(String.valueOf(l.get(0))); - } else if (l.isEmpty()) { + } + else if (l.isEmpty()) { parsedResult = new ParsedResult(""); - } else { + } + else { parsedResult = new ParsedResult(l); } - } else { + } + else { try { // Turns integers into double as well parsedResult = new ParsedResult(Double.valueOf(input.toString())); - } catch (NumberFormatException ignored) { + } + catch (NumberFormatException ignored) { // returns as String } } diff --git a/src/main/java/com/teragrep/pth10/steps/accum/AccumStep.java b/src/main/java/com/teragrep/pth10/steps/accum/AccumStep.java index 4e8a770..39f0d02 100644 --- a/src/main/java/com/teragrep/pth10/steps/accum/AccumStep.java +++ b/src/main/java/com/teragrep/pth10/steps/accum/AccumStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -65,9 +65,11 @@ import java.util.List; public class AccumStep extends AbstractStep implements Serializable { + private final String sourceField; private final String renameField; private final NullValue nullValue; + public AccumStep(final NullValue nullValue, final String sourceField, final String renameField) { super(); this.properties.add(CommandProperty.IGNORE_DEFAULT_SORTING); @@ -80,43 +82,50 @@ public AccumStep(final NullValue nullValue, final String sourceField, final Stri public Dataset get(Dataset dataset) { // group all events under the same group (=0) final String groupCol = "$$accum_internal_grouping_col$$"; - Dataset dsWithGroupCol = dataset.withColumn(groupCol, - functions.lit(0)); + Dataset dsWithGroupCol = dataset.withColumn(groupCol, functions.lit(0)); // Create output encoder for results ExpressionEncoder outputEncoder; if (renameField.isEmpty()) { // No rename field: Use source column for results - dsWithGroupCol = dsWithGroupCol.withColumn(sourceField, - functions.col(sourceField).cast(DataTypes.StringType)); + dsWithGroupCol = dsWithGroupCol + .withColumn(sourceField, functions.col(sourceField).cast(DataTypes.StringType)); outputEncoder = RowEncoder.apply(dsWithGroupCol.schema()); - } else { + } + else { // Rename field: Used 'as ', returns StringType final StructType st = dsWithGroupCol.schema().add(renameField, DataTypes.StringType); outputEncoder = RowEncoder.apply(st); } // group dataset by '0', creating one group - KeyValueGroupedDataset keyValueGroupedDs = dsWithGroupCol.groupByKey(( - MapFunction) (r) -> (Integer) r.getAs(groupCol), Encoders.INT()); + KeyValueGroupedDataset keyValueGroupedDs = dsWithGroupCol + .groupByKey((MapFunction) (r) -> (Integer) r.getAs(groupCol), Encoders.INT()); // use flatMapGroupsWithState to retain state between rows; grouping really isn't used here. // IntermediateState is used to retain state of cumulative sum. 
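A plain-Java sketch (not part of this patch) of the running-sum state the comment above refers to: the sum is kept as a Long until a Double value is accumulated, after which the Double representation is reported, mirroring the IntermediateState class. The class name RunningSumSketch is hypothetical:

    public class RunningSumSketch {

        private long asLong = 0L;
        private double asDouble = 0.0d;
        private boolean longType = true;

        public void accumulate(long val) {
            // long input keeps both representations in sync
            asLong += val;
            asDouble += val;
        }

        public void accumulate(double val) {
            // a single double demotes the whole sum to double precision
            longType = false;
            asDouble += val;
        }

        public String current() {
            return longType ? Long.toString(asLong) : Double.toString(asDouble);
        }

        public static void main(String[] args) {
            RunningSumSketch sum = new RunningSumSketch();
            sum.accumulate(1L);
            sum.accumulate(2L);
            System.out.println(sum.current()); // "3"
            sum.accumulate(0.5d);
            System.out.println(sum.current()); // "3.5"
        }
    }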
- Dataset rv = keyValueGroupedDs.flatMapGroupsWithState(this::flatMapGroupsWithStateFunc, - OutputMode.Append(), Encoders.javaSerialization(IntermediateState.class), - outputEncoder, GroupStateTimeout.NoTimeout()); + Dataset rv = keyValueGroupedDs + .flatMapGroupsWithState( + this::flatMapGroupsWithStateFunc, OutputMode.Append(), Encoders + .javaSerialization(IntermediateState.class), + outputEncoder, GroupStateTimeout.NoTimeout() + ); // Return whilst dropping grouping column return rv.drop(groupCol); } - private Iterator flatMapGroupsWithStateFunc(Integer group, Iterator events, - GroupState state) { + private Iterator flatMapGroupsWithStateFunc( + Integer group, + Iterator events, + GroupState state + ) { // Get the previous state if applicable, otherwise initialize state final IntermediateState currentState; if (state.exists()) { currentState = state.get(); - } else { + } + else { currentState = new IntermediateState(); } @@ -138,21 +147,28 @@ private Iterator flatMapGroupsWithStateFunc(Integer group, Iterator ev if (parsedResult.getType().equals(ParsedResult.Type.LONG)) { // got long, accumulate currentState.accumulate(parsedResult.getLong()); - } else if (parsedResult.getType().equals(ParsedResult.Type.DOUBLE)) { + } + else if (parsedResult.getType().equals(ParsedResult.Type.DOUBLE)) { // got double, accumulate currentState.accumulate(parsedResult.getDouble()); - } else { + } + else { // string, skip and return empty skip = true; } // Build new row: First, add already existing fields for (int i = 0; i < r.length(); i++) { - if (renameField.isEmpty() && i==r.fieldIndex(sourceField) && !skip) { + if (renameField.isEmpty() && i == r.fieldIndex(sourceField) && !skip) { // replace old content with cumulative sum if no new field given - rowContents.add(currentState.isLongType() ? - currentState.asLong().toString() : currentState.asDouble().toString()); - } else { + rowContents + .add( + currentState.isLongType() ? currentState.asLong().toString() : currentState + .asDouble() + .toString() + ); + } + else { // return old content if renameField was given or current row is to be skipped rowContents.add(r.get(i)); } @@ -163,9 +179,14 @@ private Iterator flatMapGroupsWithStateFunc(Integer group, Iterator ev if (skip) { // on skip return null rowContents.add(nullValue.value()); - } else { - rowContents.add(currentState.isLongType() ? - currentState.asLong().toString() : currentState.asDouble().toString()); + } + else { + rowContents + .add( + currentState.isLongType() ? currentState.asLong().toString() : currentState + .asDouble() + .toString() + ); } } // Add new row to collection of new rows diff --git a/src/main/java/com/teragrep/pth10/steps/accum/IntermediateState.java b/src/main/java/com/teragrep/pth10/steps/accum/IntermediateState.java index 6aeb521..3e4c28e 100644 --- a/src/main/java/com/teragrep/pth10/steps/accum/IntermediateState.java +++ b/src/main/java/com/teragrep/pth10/steps/accum/IntermediateState.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. 
If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -48,6 +48,7 @@ import java.io.Serializable; public class IntermediateState implements Serializable { + private static final long serialVersionUID = 1L; private Double aggValueAsDouble; private Long aggValueAsLong; @@ -62,14 +63,14 @@ public IntermediateState() { public void accumulate(long val) { // If long type, accumulate on both as long has less precision compared to double - this.aggValueAsLong+=val; - this.aggValueAsDouble+=val; + this.aggValueAsLong += val; + this.aggValueAsDouble += val; } public void accumulate(double val) { // If double type, accumulate on only double field and set isLongType to false. this.isLongType = false; - this.aggValueAsDouble+=val; + this.aggValueAsDouble += val; } public boolean isLongType() { @@ -80,7 +81,8 @@ public Long asLong() { // allow return only if longs were present with no doubles if (this.isLongType) { return this.aggValueAsLong; - } else { + } + else { throw new IllegalStateException("Can't return long value as double type was used!"); } } @@ -89,7 +91,8 @@ public Double asDouble() { // allow return only if one or more doubles were present if (!this.isLongType) { return this.aggValueAsDouble; - } else { + } + else { throw new IllegalStateException("Can't return double value as long type was used!"); } } @@ -114,4 +117,4 @@ public void setAggValueAsLong(Long aggValueAsLong) { public void setLongType(boolean longType) { isLongType = longType; } -} \ No newline at end of file +} diff --git a/src/main/java/com/teragrep/pth10/steps/addtotals/AddtotalsIntermediateState.java b/src/main/java/com/teragrep/pth10/steps/addtotals/AddtotalsIntermediateState.java index 0757ed4..59a0342 100644 --- a/src/main/java/com/teragrep/pth10/steps/addtotals/AddtotalsIntermediateState.java +++ b/src/main/java/com/teragrep/pth10/steps/addtotals/AddtotalsIntermediateState.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -51,12 +51,12 @@ import java.util.HashMap; public class AddtotalsIntermediateState implements Serializable { + private static final long serialVersionUID = 1L; private Map aggValuePerColumn; /** - * Initialize a new empty AddtotalsIntermediateState to use with - * flatMapGroupsWithState + * Initialize a new empty AddtotalsIntermediateState to use with flatMapGroupsWithState */ public AddtotalsIntermediateState() { this.aggValuePerColumn = new HashMap<>(); @@ -64,7 +64,8 @@ public AddtotalsIntermediateState() { /** * Accumulate a long value to index i - * @param i index, should match row index + * + * @param i index, should match row index * @param val long value to add into accumulated sum */ public void accumulate(long i, long val) { @@ -75,7 +76,8 @@ public void accumulate(long i, long val) { /** * Accumulate a double value to index i - * @param i index, should match row index + * + * @param i index, should match row index * @param val double value to add into accumulated sum */ public void accumulate(long i, double val) { @@ -86,6 +88,7 @@ public void accumulate(long i, double val) { /** * Return the accumulated sum as a long, if no doubles were present. + * * @param i column index * @return accumulated sum as long * @throws IllegalStateException if double type was used @@ -93,13 +96,15 @@ public void accumulate(long i, double val) { public Long asLong(long i) { if (aggValuePerColumn.get(i).isLongType()) { return aggValuePerColumn.get(i).asLong(); - } else { + } + else { throw new IllegalStateException("Can't return long value as double type was used!"); } } /** * Return the accumulated sum as a double, if there were any of them present + * * @param i column index * @return accumulated sum as double * @throws IllegalStateException if no double type was used @@ -107,26 +112,30 @@ public Long asLong(long i) { public Double asDouble(long i) { if (!aggValuePerColumn.get(i).isLongType()) { return this.aggValuePerColumn.get(i).asDouble(); - } else { + } + else { throw new IllegalStateException("Can't return double value as long type was used!"); } } /** * Return the accumulated sum as a string + * * @param i column index * @return accumulated sum as string, which can be formatted as an integer or decimal number */ public String asString(long i) { if (aggValuePerColumn.get(i).isLongType()) { return asLong(i).toString(); - } else { + } + else { return asDouble(i).toString(); } } /** * Checks if the given index i exists in the internal mapping + * * @param i column index * @return does the given index i exist in the mapping */ @@ -136,6 +145,7 @@ public boolean exists(long i) { /** * For Spark internal usage. Required for Java Bean compliance. + * * @return internal mapping of accumulated sums per column */ public Map getAggValuePerColumn() { @@ -144,9 +154,13 @@ public Map getAggValuePerColumn() { /** * For Spark internal usage. Required for Java Bean compliance. - *

Spark uses this internally, but the param will be of the type AbstractMap. Need to copy to HashMap to allow for put() operation. - * Otherwise accumulate() method will fail as AbstractMap does not support map manipulation via the put() method. - * Should not be a big deal as the map will contain at best the same amount of entries as the dataset has columns. + * + * Spark uses this internally, but the param will be of the type AbstractMap. Need to copy to HashMap to allow for + * put() operation. Otherwise accumulate() method will fail as AbstractMap does not support map manipulation via the + * put() method. Should not be a big deal as the map will contain at best the same amount of entries as the dataset + * has columns. + *
+ * * @param aggValuePerColumn map to set as the internal mapping of accumulated sums per column. */ public void setAggValuePerColumn(Map aggValuePerColumn) { diff --git a/src/main/java/com/teragrep/pth10/steps/addtotals/AddtotalsStep.java b/src/main/java/com/teragrep/pth10/steps/addtotals/AddtotalsStep.java index 443980c..2f22fd3 100644 --- a/src/main/java/com/teragrep/pth10/steps/addtotals/AddtotalsStep.java +++ b/src/main/java/com/teragrep/pth10/steps/addtotals/AddtotalsStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -59,6 +59,7 @@ import java.util.List; public class AddtotalsStep extends AbstractStep implements Serializable { + public final boolean row; public final boolean col; public final String fieldName; @@ -69,8 +70,15 @@ public class AddtotalsStep extends AbstractStep implements Serializable { private final BatchCollect bc; private final NumericColumnSum numericColumnSum; - public AddtotalsStep(DPLParserCatalystContext catCtx, - boolean row, boolean col, String fieldName, String labelField, String label, List fieldList) { + public AddtotalsStep( + DPLParserCatalystContext catCtx, + boolean row, + boolean col, + String fieldName, + String labelField, + String label, + List fieldList + ) { super(); this.properties.add(CommandProperty.SEQUENTIAL_ONLY); this.properties.add(CommandProperty.USES_INTERNAL_BATCHCOLLECT); @@ -96,7 +104,7 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { /* UDF per column -> sum result to target column -> repeat until all cols iterated through - + col=bool: show extra event at end for each column total row=bool: show extra column at end for each row total fieldList: for which fields, defaults to all @@ -110,7 +118,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { dataset = dataset.withColumn(fieldName, functions.lit(0)); for (String field : dataset.schema().fieldNames()) { if ((fieldList.isEmpty() || fieldList.contains(field)) && !field.equals(fieldName)) { - dataset = dataset.withColumn(fieldName, functions.col(fieldName).plus(addtotalsUDF.apply(functions.col(field)))); + dataset = dataset + .withColumn(fieldName, functions.col(fieldName).plus(addtotalsUDF.apply(functions.col(field)))); } } } @@ -129,14 +138,17 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // Perform the accumulation final Iterator collected = dataset.collectAsList().iterator(); - lastRow = SparkSession.builder().getOrCreate().createDataFrame(numericColumnSum.process(collected), dataset.schema()); + lastRow = SparkSession + .builder() + .getOrCreate() + .createDataFrame(numericColumnSum.process(collected), dataset.schema()); // fieldName takes priority over labelField if the same name if (!fieldName.equals(labelField)) { for (String field : dataset.schema().fieldNames()) { if (field.equals(labelField)) { - dataset = dataset.withColumn(field, 
functions.when( - functions.col(field).isNull(), functions.lit(""))); + dataset = dataset + .withColumn(field, functions.when(functions.col(field).isNull(), functions.lit(""))); lastRow = lastRow.withColumn(field, functions.lit(label)); break; } diff --git a/src/main/java/com/teragrep/pth10/steps/addtotals/MultiPrecisionValuePair.java b/src/main/java/com/teragrep/pth10/steps/addtotals/MultiPrecisionValuePair.java index 24aadf3..83d0de4 100644 --- a/src/main/java/com/teragrep/pth10/steps/addtotals/MultiPrecisionValuePair.java +++ b/src/main/java/com/teragrep/pth10/steps/addtotals/MultiPrecisionValuePair.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -48,6 +48,7 @@ import java.io.Serializable; public class MultiPrecisionValuePair implements Serializable { + private static final long serialVersionUID = 1L; private long valueLong; private double valueDouble; @@ -104,9 +105,7 @@ public boolean isLongType() { @Override public String toString() { - return "MultiPrecisionValuePair{" + "valueLong=" + valueLong + - ", valueDouble=" + valueDouble + - ", isLongType=" + isLongType + - '}'; + return "MultiPrecisionValuePair{" + "valueLong=" + valueLong + ", valueDouble=" + valueDouble + ", isLongType=" + + isLongType + '}'; } } diff --git a/src/main/java/com/teragrep/pth10/steps/addtotals/NumericColumnSum.java b/src/main/java/com/teragrep/pth10/steps/addtotals/NumericColumnSum.java index ced87e7..1faec72 100644 --- a/src/main/java/com/teragrep/pth10/steps/addtotals/NumericColumnSum.java +++ b/src/main/java/com/teragrep/pth10/steps/addtotals/NumericColumnSum.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -57,11 +57,14 @@ import java.util.List; public class NumericColumnSum implements Serializable { + private static final long serialVersionUID = 1L; private final AddtotalsIntermediateState currentState; + public NumericColumnSum() { this.currentState = new AddtotalsIntermediateState(); } + public List process(Iterator events) { // Perform the cumulative sum aggregation List rv = new ArrayList<>(); @@ -79,7 +82,8 @@ public List process(Iterator events) { final String valueAsString; if (rowItem == null) { valueAsString = ""; - } else { + } + else { valueAsString = rowItem.toString(); } // Parse to LONG or DOUBLE. Others will be STRING and skipped. @@ -88,7 +92,8 @@ public List process(Iterator events) { if (parsedResult.getType().equals(ParsedResult.Type.LONG)) { // got long, accumulate currentState.accumulate(i, parsedResult.getLong()); - } else if (parsedResult.getType().equals(ParsedResult.Type.DOUBLE)) { + } + else if (parsedResult.getType().equals(ParsedResult.Type.DOUBLE)) { // got double, accumulate currentState.accumulate(i, parsedResult.getDouble()); } diff --git a/src/main/java/com/teragrep/pth10/steps/chart/AbstractChartStep.java b/src/main/java/com/teragrep/pth10/steps/chart/AbstractChartStep.java index c3f7b42..ab1f72e 100644 --- a/src/main/java/com/teragrep/pth10/steps/chart/AbstractChartStep.java +++ b/src/main/java/com/teragrep/pth10/steps/chart/AbstractChartStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.chart; import com.teragrep.pth10.steps.AbstractStep; @@ -52,6 +51,7 @@ import java.util.List; public abstract class AbstractChartStep extends AbstractStep { + protected final List listOfExpr; protected final List listOfGroupBy; diff --git a/src/main/java/com/teragrep/pth10/steps/chart/ChartStep.java b/src/main/java/com/teragrep/pth10/steps/chart/ChartStep.java index fb85e69..273b412 100644 --- a/src/main/java/com/teragrep/pth10/steps/chart/ChartStep.java +++ b/src/main/java/com/teragrep/pth10/steps/chart/ChartStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.chart; import org.apache.spark.sql.Column; @@ -74,9 +73,7 @@ public Dataset get(Dataset dataset) { Column mainExpr = listOfExpr.get(0); // skip first one as .agg has strange arguments - Seq seqOfExpr = - JavaConversions.asScalaBuffer(listOfExpr.subList(1, - listOfExpr.size())); + Seq seqOfExpr = JavaConversions.asScalaBuffer(listOfExpr.subList(1, listOfExpr.size())); Seq seqOfGroupBy = JavaConversions.asScalaBuffer(listOfGroupBy); return dataset.groupBy(seqOfGroupBy).agg(mainExpr, seqOfExpr); diff --git a/src/main/java/com/teragrep/pth10/steps/convert/AbstractConvertStep.java b/src/main/java/com/teragrep/pth10/steps/convert/AbstractConvertStep.java index a4762cd..e4dca5a 100644 --- a/src/main/java/com/teragrep/pth10/steps/convert/AbstractConvertStep.java +++ b/src/main/java/com/teragrep/pth10/steps/convert/AbstractConvertStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.convert; import com.teragrep.pth10.steps.AbstractStep; @@ -52,9 +51,11 @@ import java.util.List; public abstract class AbstractConvertStep extends AbstractStep { + protected List listOfCommands = new ArrayList<>(); protected List listOfFieldsToOmit = new ArrayList<>(); protected String timeformat = "%m/%d/%Y %H:%M:%S"; + public AbstractConvertStep() { super(); } diff --git a/src/main/java/com/teragrep/pth10/steps/convert/ConvertCommand.java b/src/main/java/com/teragrep/pth10/steps/convert/ConvertCommand.java index 8ab6e69..73c255f 100644 --- a/src/main/java/com/teragrep/pth10/steps/convert/ConvertCommand.java +++ b/src/main/java/com/teragrep/pth10/steps/convert/ConvertCommand.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,11 +43,13 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
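A hedged sketch (not part of this patch) of the agg() call shape used in the ChartStep.get() hunk above: the Java-facing agg(Column, Column...) overload takes the first aggregate separately from the rest, which is why mainExpr is split off from the remaining expressions. Column names and the class name ChartAggSketch are hypothetical:

    import static org.apache.spark.sql.functions.avg;
    import static org.apache.spark.sql.functions.col;
    import static org.apache.spark.sql.functions.count;

    import java.util.Arrays;
    import java.util.List;

    import org.apache.spark.sql.Column;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    public class ChartAggSketch {

        public static Dataset<Row> aggregate(Dataset<Row> dataset) {
            List<Column> exprs = Arrays.asList(count("_raw").as("count"), avg("offset").as("avg_offset"));
            // first aggregate goes in on its own, the remainder as varargs
            Column first = exprs.get(0);
            Column[] rest = exprs.subList(1, exprs.size()).toArray(new Column[0]);
            return dataset.groupBy(col("host")).agg(first, rest);
        }
    }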
*/ - package com.teragrep.pth10.steps.convert; public class ConvertCommand { - public ConvertCommand() { } + + public ConvertCommand() { + } + public enum ConvertCommandType { AUTO, NUM, MKTIME, CTIME, DUR2SEC, MEMK, MSTIME, RMCOMMA, RMUNIT } diff --git a/src/main/java/com/teragrep/pth10/steps/convert/ConvertStep.java b/src/main/java/com/teragrep/pth10/steps/convert/ConvertStep.java index 66bdd31..df1a42a 100644 --- a/src/main/java/com/teragrep/pth10/steps/convert/ConvertStep.java +++ b/src/main/java/com/teragrep/pth10/steps/convert/ConvertStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.convert; import com.teragrep.pth10.ast.commands.transformstatement.convert.*; @@ -58,15 +57,18 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -public final class ConvertStep extends AbstractConvertStep{ +public final class ConvertStep extends AbstractConvertStep { + private static final Logger LOGGER = LoggerFactory.getLogger(ConvertStep.class); private SparkSession sparkSession; + public ConvertStep() { super(); } /** * Perform the | convert command and return result dataset + * * @return resulting dataset after command */ @Override @@ -81,11 +83,14 @@ public Dataset get(Dataset dataset) { // Process all of the convert commands for (ConvertCommand cmd : this.listOfCommands) { - LOGGER.info("Processing convert command <[{}]> using field <[{}]> renamed as <[{}]>",cmd.getCommandType(),cmd.getFieldParam(),cmd.getRenameField()); + LOGGER + .info( + "Processing convert command <[{}]> using field <[{}]> renamed as <[{}]>", + cmd.getCommandType(), cmd.getFieldParam(), cmd.getRenameField() + ); // Get wildcarded fields - List fields = getWildcardFields(cmd.getFieldParam(), - rv.columns(), this.listOfFieldsToOmit); + List fields = getWildcardFields(cmd.getFieldParam(), rv.columns(), this.listOfFieldsToOmit); // Process each field with the given conversion function for (String field : fields) { @@ -129,8 +134,9 @@ public Dataset get(Dataset dataset) { /** * Check for wildcard and omit fields given in none() command if present - * @param wc Wildcard - * @param cols Array of column names present in data + * + * @param wc Wildcard + * @param cols Array of column names present in data * @param omitList List of column names to omit * @return List of column names that fit the wildcard */ @@ -157,10 +163,11 @@ private List getWildcardFields(String wc, String[] cols, List om /** * Process conversion function auto() - * @param dataset Input dataset - * @param field Field, where source data is - * @param renameField AS new-field-name - * @param cancelOnNull On null value, cancel and don't return + * + * @param dataset Input dataset + * @param field Field, where source data is + * 
@param renameField AS new-field-name + * @param cancelOnNull On null value, cancel and don't return * @return Input dataset with added result column */ private Dataset auto(Dataset dataset, String field, String renameField, boolean cancelOnNull) { @@ -176,9 +183,10 @@ private Dataset auto(Dataset dataset, String field, String renameField /** * Process conversion function auto() with cancelOnNull=true (default struck behaviour) - * @param dataset Input dataset - * @param field Field, where source data is - * @param renameField AS new-field-name + * + * @param dataset Input dataset + * @param field Field, where source data is + * @param renameField AS new-field-name * @return Input dataset with added result column */ private Dataset auto(Dataset dataset, String field, String renameField) { @@ -187,9 +195,10 @@ private Dataset auto(Dataset dataset, String field, String renameField /** * Process conversion function num() - * @param dataset Input dataset - * @param field Field, where source data is - * @param renameField AS new-field-name + * + * @param dataset Input dataset + * @param field Field, where source data is + * @param renameField AS new-field-name * @return Input dataset with added result column */ private Dataset num(Dataset dataset, String field, String renameField) { @@ -198,41 +207,44 @@ private Dataset num(Dataset dataset, String field, String renameField) /** * Process conversion function mktime() - * @param dataset Input dataset - * @param field Field, where source data is - * @param renameField AS new-field-name + * + * @param dataset Input dataset + * @param field Field, where source data is + * @param renameField AS new-field-name * @return Input dataset with added result column */ private Dataset mktime(Dataset dataset, String field, String renameField) { UserDefinedFunction mktimeUDF = functions.udf(new Mktime(), DataTypes.StringType); sparkSession.udf().register("UDF_Mktime", mktimeUDF); - Column udfResult = functions.callUDF("UDF_Mktime", - functions.col(field).cast(DataTypes.StringType), functions.lit(this.timeformat)); + Column udfResult = functions + .callUDF("UDF_Mktime", functions.col(field).cast(DataTypes.StringType), functions.lit(this.timeformat)); return dataset.withColumn(renameField == null ? field : renameField, udfResult); } /** * Process conversion function ctime() - * @param dataset Input dataset - * @param field Field, where source data is - * @param renameField AS new-field-name + * + * @param dataset Input dataset + * @param field Field, where source data is + * @param renameField AS new-field-name * @return Input dataset with added result column */ private Dataset ctime(Dataset dataset, String field, String renameField) { UserDefinedFunction ctimeUDF = functions.udf(new Ctime(), DataTypes.StringType); sparkSession.udf().register("UDF_Ctime", ctimeUDF); - Column udfResult = functions.callUDF("UDF_Ctime", functions.col(field).cast(DataTypes.StringType), - functions.lit(timeformat)); + Column udfResult = functions + .callUDF("UDF_Ctime", functions.col(field).cast(DataTypes.StringType), functions.lit(timeformat)); return dataset.withColumn(renameField == null ? 
field : renameField, udfResult); } /** * Process conversion function dur2sec() - * @param dataset Input dataset - * @param field Field, where source data is - * @param renameField AS new-field-name + * + * @param dataset Input dataset + * @param field Field, where source data is + * @param renameField AS new-field-name * @return Input dataset with added result column */ private Dataset dur2sec(Dataset dataset, String field, String renameField) { @@ -245,9 +257,10 @@ private Dataset dur2sec(Dataset dataset, String field, String renameFi /** * Process conversion function memk() - * @param dataset Input dataset - * @param field Field, where source data is - * @param renameField AS new-field-name + * + * @param dataset Input dataset + * @param field Field, where source data is + * @param renameField AS new-field-name * @return Input dataset with added result column */ private Dataset memk(Dataset dataset, String field, String renameField) { @@ -260,9 +273,10 @@ private Dataset memk(Dataset dataset, String field, String renameField /** * Process conversion function mstime() - * @param dataset Input dataset - * @param field Field, where source data is - * @param renameField AS new-field-name + * + * @param dataset Input dataset + * @param field Field, where source data is + * @param renameField AS new-field-name * @return Input dataset with added result column */ private Dataset mstime(Dataset dataset, String field, String renameField) { @@ -275,20 +289,23 @@ private Dataset mstime(Dataset dataset, String field, String renameFie /** * Process conversion function rmcomma() - * @param dataset Input dataset - * @param field Field, where source data is - * @param renameField AS new-field-name + * + * @param dataset Input dataset + * @param field Field, where source data is + * @param renameField AS new-field-name * @return Input dataset with added result column */ private Dataset rmcomma(Dataset dataset, String field, String renameField) { - return dataset.withColumn(renameField == null ? field : renameField, functions.regexp_replace(functions.col(field), ",", "")); + return dataset + .withColumn(renameField == null ? field : renameField, functions.regexp_replace(functions.col(field), ",", "")); } /** * Process conversion function rmunit() - * @param dataset Input dataset - * @param field Field, where source data is - * @param renameField AS new-field-name + * + * @param dataset Input dataset + * @param field Field, where source data is + * @param renameField AS new-field-name * @return Input dataset with added result column */ private Dataset rmunit(Dataset dataset, String field, String renameField) { diff --git a/src/main/java/com/teragrep/pth10/steps/dedup/AbstractDedupStep.java b/src/main/java/com/teragrep/pth10/steps/dedup/AbstractDedupStep.java index 1df978c..3c65c87 100644 --- a/src/main/java/com/teragrep/pth10/steps/dedup/AbstractDedupStep.java +++ b/src/main/java/com/teragrep/pth10/steps/dedup/AbstractDedupStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. 
If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.dedup; import com.teragrep.functions.dpf_02.BatchCollect; @@ -54,6 +53,7 @@ import java.util.Map; public abstract class AbstractDedupStep extends AbstractStep { + protected List listOfFields; protected Map> fieldsProcessed; protected int maxDuplicates; diff --git a/src/main/java/com/teragrep/pth10/steps/dedup/DedupStep.java b/src/main/java/com/teragrep/pth10/steps/dedup/DedupStep.java index ae57bbd..a4b402e 100644 --- a/src/main/java/com/teragrep/pth10/steps/dedup/DedupStep.java +++ b/src/main/java/com/teragrep/pth10/steps/dedup/DedupStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.dedup; import com.teragrep.functions.dpf_02.BatchCollect; @@ -63,9 +62,18 @@ import java.util.stream.Collectors; public final class DedupStep extends AbstractDedupStep { + private static final Logger LOGGER = LoggerFactory.getLogger(DedupStep.class); - public DedupStep(List listOfFields, int maxDuplicates, boolean keepEmpty, boolean keepEvents, boolean consecutive, - DPLParserCatalystContext catCtx, boolean completeOutputMode) { + + public DedupStep( + List listOfFields, + int maxDuplicates, + boolean keepEmpty, + boolean keepEvents, + boolean consecutive, + DPLParserCatalystContext catCtx, + boolean completeOutputMode + ) { super(); this.properties.add(CommandProperty.SEQUENTIAL_ONLY); this.properties.add(CommandProperty.USES_INTERNAL_BATCHCOLLECT); @@ -110,13 +118,14 @@ public Dataset get(Dataset dataset) { if (fieldValueObject == null && !keepEmpty) { output.set(i, nullifyRowField(r, schema, fieldName)); continue;// filter out - } else if (fieldValueObject == null) { + } + else if (fieldValueObject == null) { fieldValue = "null"; - } else { + } + else { fieldValue = fieldValueObject.toString(); } - // consecutive=true // return at end of if clause, because consecutive=true ignores // maxDuplicates @@ -125,9 +134,12 @@ public Dataset get(Dataset dataset) { if (previousRow.get() == null) { //LOGGER.debug("-> Applying row as previous row"); previousRow.set(r); - } else { - final String prevValue = previousRow.get() - .get(previousRow.get().fieldIndex(fieldName)).toString(); + } + else { + final String prevValue = previousRow + .get() + .get(previousRow.get().fieldIndex(fieldName)) + .toString(); if (prevValue.equals(fieldValue)) { //LOGGER.debug("-> Filtering"); output.set(i, nullifyRowField(r, schema, fieldName)); @@ -142,19 +154,22 @@ public Dataset get(Dataset dataset) { if 
(!fieldsProcessed.get(fieldName).containsKey(fieldValue)) { // specific field value was not encountered yet, add to map fieldsProcessed.get(fieldName).put(fieldValue, 1L); - } else { + } + else { // field:value present in map, check if amount of duplicates is too high long newValue = fieldsProcessed.get(fieldName).get(fieldValue) + 1L; if (newValue > maxDuplicates) { // too many duplicates, filter out output.set(i, nullifyRowField(r, schema, fieldName)); continue; - } else { + } + else { // duplicates within given max value, ok to be present fieldsProcessed.get(fieldName).put(fieldValue, newValue); } } - } else { + } + else { // the field was not encountered yet, add to map final Map newMap = new ConcurrentHashMap<>(); newMap.put(fieldValue, 1L); @@ -181,7 +196,6 @@ else if (fieldValueObject == null) { fieldValue = fieldValueObject.toString(); } - // consecutive=true // return at end of if clause, because consecutive=true ignores // maxDuplicates @@ -192,11 +206,13 @@ else if (fieldValueObject == null) { previousRow.set(r); } else { - final String prevValue = previousRow.get() - .get(previousRow.get().fieldIndex(fieldName)).toString(); + final String prevValue = previousRow + .get() + .get(previousRow.get().fieldIndex(fieldName)) + .toString(); if (prevValue.equals(fieldValue)) { //LOGGER.debug("-> Filtering"); - doNotFilter=false; + doNotFilter = false; } previousRow.set(r); } @@ -215,7 +231,7 @@ else if (fieldValueObject == null) { long newValue = fieldsProcessed.get(fieldName).get(fieldValue) + 1L; if (newValue > maxDuplicates) { // too many duplicates, filter out - doNotFilter=false; + doNotFilter = false; } else { // duplicates within given max value, ok to be present @@ -240,11 +256,12 @@ else if (fieldValueObject == null) { /** * Takes the row and generates a new one with the given field nullified - * @param r row + * + * @param r row * @param fieldName field to nullify * @return row with the field nullified */ - private Row nullifyRowField(final Row r, final StructType schema, String fieldName) { + private Row nullifyRowField(final Row r, final StructType schema, String fieldName) { final List newRowValues = new ArrayList<>(); for (final StructField field : schema.fields()) { @@ -256,7 +273,7 @@ private Row nullifyRowField(final Row r, final StructType schema, String fieldN newRowValues.add(catCtx.nullValue.value()); } else { - throw new IllegalStateException("Field was not nullable! field=<" + field.name() +">"); + throw new IllegalStateException("Field was not nullable! field=<" + field.name() + ">"); } } diff --git a/src/main/java/com/teragrep/pth10/steps/dpl/AbstractDplStep.java b/src/main/java/com/teragrep/pth10/steps/dpl/AbstractDplStep.java index a356f46..fdf8a3d 100644 --- a/src/main/java/com/teragrep/pth10/steps/dpl/AbstractDplStep.java +++ b/src/main/java/com/teragrep/pth10/steps/dpl/AbstractDplStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.dpl; import com.teragrep.pth10.datasources.GeneratedDatasource; @@ -52,9 +51,11 @@ import java.util.List; public abstract class AbstractDplStep extends AbstractStep { + public enum DplCommandType { PARSETREE } + protected DplCommandType commandType = DplCommandType.PARSETREE; protected List lines = null; protected GeneratedDatasource generatedDatasource = null; diff --git a/src/main/java/com/teragrep/pth10/steps/dpl/DplStep.java b/src/main/java/com/teragrep/pth10/steps/dpl/DplStep.java index 0e7e20b..8c66c1c 100644 --- a/src/main/java/com/teragrep/pth10/steps/dpl/DplStep.java +++ b/src/main/java/com/teragrep/pth10/steps/dpl/DplStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,16 +43,17 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.dpl; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; public final class DplStep extends AbstractDplStep { + public DplStep() { super(); } + @Override public Dataset get(Dataset dataset) { if (dataset == null) { diff --git a/src/main/java/com/teragrep/pth10/steps/eval/AbstractEvalStep.java b/src/main/java/com/teragrep/pth10/steps/eval/AbstractEvalStep.java index f662b03..263a434 100644 --- a/src/main/java/com/teragrep/pth10/steps/eval/AbstractEvalStep.java +++ b/src/main/java/com/teragrep/pth10/steps/eval/AbstractEvalStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,13 +43,13 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.eval; import com.teragrep.pth10.steps.AbstractStep; import org.apache.spark.sql.Column; public abstract class AbstractEvalStep extends AbstractStep { + protected String leftSide = null; protected Column rightSide = null; diff --git a/src/main/java/com/teragrep/pth10/steps/eval/EvalStep.java b/src/main/java/com/teragrep/pth10/steps/eval/EvalStep.java index 1941a57..d871f87 100644 --- a/src/main/java/com/teragrep/pth10/steps/eval/EvalStep.java +++ b/src/main/java/com/teragrep/pth10/steps/eval/EvalStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.eval; import org.apache.spark.sql.Dataset; @@ -53,7 +52,8 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; -public final class EvalStep extends AbstractEvalStep{ +public final class EvalStep extends AbstractEvalStep { + public EvalStep() { super(); } @@ -88,7 +88,8 @@ public Dataset get(Dataset dataset) { /** * Check if dataset contains a column - * @param schema ds.schema() + * + * @param schema ds.schema() * @param colName column name * @return does it contain the column true/false */ diff --git a/src/main/java/com/teragrep/pth10/steps/eventstats/AbstractEventstatsStep.java b/src/main/java/com/teragrep/pth10/steps/eventstats/AbstractEventstatsStep.java index 6aeeeec..fcc2052 100644 --- a/src/main/java/com/teragrep/pth10/steps/eventstats/AbstractEventstatsStep.java +++ b/src/main/java/com/teragrep/pth10/steps/eventstats/AbstractEventstatsStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
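The eval hunk above documents a helper that checks whether the dataset already contains a column before the evaluated one is appended. Its body is not part of this hunk; one straightforward way to implement the documented behaviour, sketched with an illustrative method name:

    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;

    // Sketch: exact-name lookup over the schema fields, matching the javadoc above.
    static boolean containsColumn(StructType schema, String colName) {
        for (StructField field : schema.fields()) {
            if (field.name().equals(colName)) {
                return true;
            }
        }
        return false;
    }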
*/ - package com.teragrep.pth10.steps.eventstats; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -53,6 +52,7 @@ import java.util.List; public abstract class AbstractEventstatsStep extends AbstractStep { + protected List listOfAggregations; protected String byInstruction; protected DPLParserCatalystContext catCtx; diff --git a/src/main/java/com/teragrep/pth10/steps/eventstats/EventstatsStep.java b/src/main/java/com/teragrep/pth10/steps/eventstats/EventstatsStep.java index 2581767..e2a1562 100644 --- a/src/main/java/com/teragrep/pth10/steps/eventstats/EventstatsStep.java +++ b/src/main/java/com/teragrep/pth10/steps/eventstats/EventstatsStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.eventstats; import org.apache.spark.sql.*; @@ -64,12 +63,14 @@ import java.util.stream.Collectors; public class EventstatsStep extends AbstractEventstatsStep { + private static final Logger LOGGER = LoggerFactory.getLogger(EventstatsStep.class); public EventstatsStep() { super(); this.properties.add(CommandProperty.AGGREGATE); } + @Override public Dataset get(Dataset dataset) throws StreamingQueryException { // perform aggregation @@ -99,7 +100,14 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { final String checkpointPath = pathForSave + "checkpoint/" + rndId; final String path = pathForSave + "data/" + rndId + ".avro"; - LOGGER.info(String.format("Initializing a stream query for eventstats: name: '%s', Path(avro): '%s', Checkpoint path: '%s'", queryName, path, checkpointPath)); + LOGGER + .info( + String + .format( + "Initializing a stream query for eventstats: name: '%s', Path(avro): '%s', Checkpoint path: '%s'", + queryName, path, checkpointPath + ) + ); // save ds to HDFS, and perform join on that DataStreamWriter writer = dataset @@ -146,7 +154,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { } } - Seq rearranged = JavaConversions.asScalaBuffer(schemaFields.stream().map(functions::col).collect(Collectors.toList())); + Seq rearranged = JavaConversions + .asScalaBuffer(schemaFields.stream().map(functions::col).collect(Collectors.toList())); resultDs = resultDs.select(rearranged); // rearrange to look more like original dataset return resultDs; diff --git a/src/main/java/com/teragrep/pth10/steps/explain/AbstractExplainStep.java b/src/main/java/com/teragrep/pth10/steps/explain/AbstractExplainStep.java index 8f44155..0e75afa 100644 --- a/src/main/java/com/teragrep/pth10/steps/explain/AbstractExplainStep.java +++ b/src/main/java/com/teragrep/pth10/steps/explain/AbstractExplainStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data 
Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,15 +43,16 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.explain; import com.teragrep.pth10.datasources.GeneratedDatasource; import com.teragrep.pth10.steps.AbstractStep; public abstract class AbstractExplainStep extends AbstractStep { + protected ExplainMode mode = ExplainMode.BRIEF; protected GeneratedDatasource generatedDatasource = null; + public enum ExplainMode { BRIEF, EXTENDED } diff --git a/src/main/java/com/teragrep/pth10/steps/explain/ExplainStep.java b/src/main/java/com/teragrep/pth10/steps/explain/ExplainStep.java index 64c61cf..b5e106d 100644 --- a/src/main/java/com/teragrep/pth10/steps/explain/ExplainStep.java +++ b/src/main/java/com/teragrep/pth10/steps/explain/ExplainStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.explain; import org.apache.spark.sql.Dataset; @@ -59,6 +58,7 @@ import java.util.List; public final class ExplainStep extends AbstractExplainStep { + public ExplainStep() { super(); // Has to be in Sequential_Only for queryExecution() (doesn't work with a streaming dataset) @@ -75,14 +75,15 @@ public Dataset get(Dataset dataset) { Dataset rv = null; if (this.mode == ExplainMode.BRIEF) { List rowList = Collections.singletonList(RowFactory.create(dataset.queryExecution().simpleString())); - StructType schema = new StructType(new StructField[]{ + StructType schema = new StructType(new StructField[] { StructField.apply("result", DataTypes.StringType, false, new MetadataBuilder().build()) }); rv = SparkSession.builder().getOrCreate().createDataFrame(rowList, schema); } else if (this.mode == ExplainMode.EXTENDED) { - List rowList = Collections.singletonList(RowFactory.create(dataset.queryExecution().stringWithStats())); - StructType schema = new StructType(new StructField[]{ + List rowList = Collections + .singletonList(RowFactory.create(dataset.queryExecution().stringWithStats())); + StructType schema = new StructType(new StructField[] { StructField.apply("result", DataTypes.StringType, false, new MetadataBuilder().build()) }); rv = SparkSession.builder().getOrCreate().createDataFrame(rowList, schema); diff --git a/src/main/java/com/teragrep/pth10/steps/fields/AbstractFieldsStep.java b/src/main/java/com/teragrep/pth10/steps/fields/AbstractFieldsStep.java index cabdb6a..3823375 100644 --- a/src/main/java/com/teragrep/pth10/steps/fields/AbstractFieldsStep.java +++ b/src/main/java/com/teragrep/pth10/steps/fields/AbstractFieldsStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
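ExplainStep above wraps the query plan text into a one-row dataset with a single 'result' column. The same RowFactory and StructType pattern in isolation, written as a sketch of the BRIEF branch shown in the hunk:

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.RowFactory;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.MetadataBuilder;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;
    import java.util.Collections;
    import java.util.List;

    // Sketch: build a single-row dataset holding the simple plan string,
    // mirroring the ExplainMode.BRIEF branch above.
    static Dataset<Row> explainAsDataset(Dataset<Row> dataset) {
        List<Row> rowList = Collections.singletonList(RowFactory.create(dataset.queryExecution().simpleString()));
        StructType schema = new StructType(new StructField[] {
                StructField.apply("result", DataTypes.StringType, false, new MetadataBuilder().build())
        });
        return SparkSession.builder().getOrCreate().createDataFrame(rowList, schema);
    }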
*/ - package com.teragrep.pth10.steps.fields; import com.teragrep.pth10.steps.AbstractStep; @@ -52,6 +51,7 @@ import java.util.List; public abstract class AbstractFieldsStep extends AbstractStep { + public enum FieldMode { KEEP_FIELDS, REMOVE_FIELDS } diff --git a/src/main/java/com/teragrep/pth10/steps/fields/FieldsStep.java b/src/main/java/com/teragrep/pth10/steps/fields/FieldsStep.java index 507b394..70aedb4 100644 --- a/src/main/java/com/teragrep/pth10/steps/fields/FieldsStep.java +++ b/src/main/java/com/teragrep/pth10/steps/fields/FieldsStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,14 +43,14 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.fields; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import scala.collection.JavaConversions; -public final class FieldsStep extends AbstractFieldsStep{ +public final class FieldsStep extends AbstractFieldsStep { + public FieldsStep() { super(); } diff --git a/src/main/java/com/teragrep/pth10/steps/fillnull/AbstractFillnullStep.java b/src/main/java/com/teragrep/pth10/steps/fillnull/AbstractFillnullStep.java index ed9f2f6..c674562 100644 --- a/src/main/java/com/teragrep/pth10/steps/fillnull/AbstractFillnullStep.java +++ b/src/main/java/com/teragrep/pth10/steps/fillnull/AbstractFillnullStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -52,9 +52,11 @@ import java.util.List; public abstract class AbstractFillnullStep extends AbstractStep { + protected NullValue nullValue; protected List listOfFields = new ArrayList<>(); protected String fillerString = "0"; + public void setFillerString(String fillerString) { this.fillerString = fillerString; } diff --git a/src/main/java/com/teragrep/pth10/steps/fillnull/FillnullStep.java b/src/main/java/com/teragrep/pth10/steps/fillnull/FillnullStep.java index d9e1465..548aa02 100644 --- a/src/main/java/com/teragrep/pth10/steps/fillnull/FillnullStep.java +++ b/src/main/java/com/teragrep/pth10/steps/fillnull/FillnullStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -54,7 +54,9 @@ import java.util.Arrays; public class FillnullStep extends AbstractFillnullStep { + private static final Logger LOGGER = LoggerFactory.getLogger(FillnullStep.class); + @Override public Dataset get(Dataset dataset) { if (listOfFields.isEmpty()) { @@ -67,15 +69,17 @@ public Dataset get(Dataset dataset) { if (checkForFieldsExistence(field, dataset.columns())) { // field exists // replace all "" (empty string) fields with fillerString - dataset = dataset.withColumn(field, - functions.when( // if field="" return fillerString - functions.col(field).equalTo(functions.lit(nullValue.value())), - functions.lit(fillerString)) - .otherwise( // else if field=null return fillerString - functions.when( - functions.col(field).isNull(),functions.lit(fillerString)) - .otherwise(functions.col(field)))); // else return field - } else { + dataset = dataset + .withColumn(field, functions.when( // if field="" return fillerString + functions.col(field).equalTo(functions.lit(nullValue.value())), functions.lit(fillerString) + ) + .otherwise( + // else if field=null return fillerString + functions.when(functions.col(field).isNull(), functions.lit(fillerString)).otherwise(functions.col(field)) + ) + ); // else return field + } + else { // field does not exist, create it and fill with fillerString dataset = dataset.withColumn(field, functions.lit(fillerString)); } diff --git a/src/main/java/com/teragrep/pth10/steps/format/AbstractFormatStep.java b/src/main/java/com/teragrep/pth10/steps/format/AbstractFormatStep.java index 1b71dc2..751e6c6 100644 --- a/src/main/java/com/teragrep/pth10/steps/format/AbstractFormatStep.java +++ b/src/main/java/com/teragrep/pth10/steps/format/AbstractFormatStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ 
-13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -48,6 +48,7 @@ import com.teragrep.pth10.steps.AbstractStep; public abstract class AbstractFormatStep extends AbstractStep { + protected String mvSep = "OR"; protected int maxResults = 0; protected String rowPrefix = "("; diff --git a/src/main/java/com/teragrep/pth10/steps/format/FormatStep.java b/src/main/java/com/teragrep/pth10/steps/format/FormatStep.java index b3c2188..6f83e7e 100644 --- a/src/main/java/com/teragrep/pth10/steps/format/FormatStep.java +++ b/src/main/java/com/teragrep/pth10/steps/format/FormatStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -67,6 +67,7 @@ public FormatStep() { super(); this.properties.add(CommandProperty.AGGREGATE); } + @Override public Dataset get(Dataset dataset) { // make variables that are in mapFunction final. Otherwise, they will be the initial default values. 
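The format step's map function, continued in the following hunk, emits one search fragment per row and the closing aggregation folds the fragments together. With the default prefixes and separators the result has roughly this shape (hypothetical field values, shown for orientation only):

    // Illustrative only: two input rows
    //   {host: "a", source: "x"}  and  {host: "b", source: "y"}
    // are mapped to per-row fragments
    //   ( host="a" AND source="x" )
    //   ( host="b" AND source="y" )
    // which are then folded into a single 'search' value of roughly
    //   ( ( host="a" AND source="x" ) OR ( host="b" AND source="y" ) )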
@@ -80,7 +81,8 @@ public Dataset get(Dataset dataset) { // maxResults=0 does not limit and maxResults<0 is invalid if (maxResults > 0) { mappedDs = mappedDs.limit(this.maxResults); - } else if (maxResults < 0) { + } + else if (maxResults < 0) { throw new IllegalArgumentException("Expected a non-negative integer value for 'maxresults' parameter."); } @@ -90,8 +92,12 @@ public Dataset get(Dataset dataset) { strBuilder.append(colPrefix); strBuilder.append(' '); for (int j = 0; j < r.schema().length(); j++) { - if (r.schema().fields()[j].dataType().typeName().equals( - DataTypes.createArrayType(DataTypes.StringType).typeName())) { + if ( + r.schema().fields()[j] + .dataType() + .typeName() + .equals(DataTypes.createArrayType(DataTypes.StringType).typeName()) + ) { // MV field // ( col="value1" OR col="value2" OR col="value3" ) List mvField = r.getList(j); @@ -103,20 +109,21 @@ public Dataset get(Dataset dataset) { strBuilder.append("=\""); strBuilder.append(mvField.get(k)); strBuilder.append("\""); - if (k != mvField.size()-1){ + if (k != mvField.size() - 1) { // Do not append ' OR ' on last mvField cell strBuilder.append(" ".concat(mvSep).concat(" ")); } } // ' ) ' strBuilder.append(" ) "); - } else { + } + else { // 'col="value"' strBuilder.append(r.schema().fields()[j].name()); strBuilder.append("=\""); strBuilder.append(r.getAs(r.fieldIndex(r.schema().fields()[j].name())).toString()); strBuilder.append("\""); - if (j != r.schema().fields().length-1) { + if (j != r.schema().fields().length - 1) { // ' AND ' strBuilder.append(' '); strBuilder.append(colSep); @@ -124,7 +131,6 @@ public Dataset get(Dataset dataset) { strBuilder.append(' '); } - } // ') ' @@ -132,16 +138,18 @@ public Dataset get(Dataset dataset) { strBuilder.append(' '); return RowFactory.create(strBuilder.toString()); - }, RowEncoder.apply(new StructType(new StructField[]{StructField.apply("search", DataTypes.StringType, false, new MetadataBuilder().build())}))); - + }, RowEncoder.apply(new StructType(new StructField[] { + StructField.apply("search", DataTypes.StringType, false, new MetadataBuilder().build()) + }))); - Seq concatRows = JavaConversions.asScalaBuffer(Arrays.asList( - functions.lit(this.rowPrefix.concat(" ")), // '( ' - functions.concat_ws(this.rowSep.concat(" "), // 'OR ' - functions.collect_list("search")), // cols - functions.lit(this.rowSuffix))); // ')' + Seq concatRows = JavaConversions + .asScalaBuffer(Arrays.asList(functions.lit(this.rowPrefix.concat(" ")), // '( ' + functions.concat_ws(this.rowSep.concat(" "), // 'OR ' + functions.collect_list("search") + ), // cols + functions.lit(this.rowSuffix) + )); // ')' - return mappedDs.agg( - functions.concat(concatRows).as("search")); + return mappedDs.agg(functions.concat(concatRows).as("search")); } } diff --git a/src/main/java/com/teragrep/pth10/steps/iplocation/AbstractIplocationStep.java b/src/main/java/com/teragrep/pth10/steps/iplocation/AbstractIplocationStep.java index d451c48..e73fcc1 100644 --- a/src/main/java/com/teragrep/pth10/steps/iplocation/AbstractIplocationStep.java +++ b/src/main/java/com/teragrep/pth10/steps/iplocation/AbstractIplocationStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by 
@@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.iplocation; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -53,6 +52,7 @@ import java.util.List; public abstract class AbstractIplocationStep extends AbstractStep { + protected String lang = "en"; protected String field = null; // required protected boolean allFields = false; @@ -61,7 +61,8 @@ public abstract class AbstractIplocationStep extends AbstractStep { protected String pathToDb = null; protected final String internalMapColumnName = "$$dpl_pth10_internal_iplocation_column$$"; protected final List columnsMinimal = Arrays.asList("country", "lat", "lon", "region", "city"); - protected final List columnsFull = Arrays.asList("country", "lat", "lon", "metroCode", "continent", "city", "region"); + protected final List columnsFull = Arrays + .asList("country", "lat", "lon", "metroCode", "continent", "city", "region"); protected final List columnsRirData = Arrays.asList("country", "operator"); protected final List columnsCountryData = Arrays.asList("country", "continent"); diff --git a/src/main/java/com/teragrep/pth10/steps/iplocation/IplocationStep.java b/src/main/java/com/teragrep/pth10/steps/iplocation/IplocationStep.java index f2ee8f3..3eb1729 100644 --- a/src/main/java/com/teragrep/pth10/steps/iplocation/IplocationStep.java +++ b/src/main/java/com/teragrep/pth10/steps/iplocation/IplocationStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.iplocation; import com.teragrep.pth10.ast.commands.transformstatement.iplocation.IplocationGeoIPDataMapper; @@ -60,11 +59,13 @@ import java.util.Map; /** - * Uses a GeoIP2 or rir-data MaxMind database to map IP addresses to location information, - * such as latitude, longitude, city, region, country, metro code and et cetera. + * Uses a GeoIP2 or rir-data MaxMind database to map IP addresses to location information, such as latitude, longitude, + * city, region, country, metro code and et cetera. 
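The step registers a UDF that returns a MapType column and then lifts individual map keys into ordinary columns, as the following hunk shows for the GeoIP and rir mappers. A simplified, self-contained sketch of that shape; the lookup logic, UDF name, temporary column name and "country" key here are placeholders rather than the mappers from the patch:

    import org.apache.spark.sql.Column;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.api.java.UDF1;
    import org.apache.spark.sql.functions;
    import org.apache.spark.sql.types.DataTypes;
    import java.util.Collections;
    import java.util.Map;

    // Sketch: a map-returning UDF whose keys are afterwards exposed as columns.
    static Dataset<Row> addCountryColumn(SparkSession spark, Dataset<Row> dataset, String field) {
        UDF1<String, Map<String, String>> lookup =
                ip -> Collections.singletonMap("country", "unknown"); // placeholder lookup
        spark.udf().register("UDF_IPLocation_sketch",
                functions.udf(lookup, DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType, true)));
        Column udfResult = functions.callUDF("UDF_IPLocation_sketch", functions.col(field));
        return dataset
                .withColumn("$$tmp_iplocation_map$$", udfResult)
                .withColumn("country", functions.col("$$tmp_iplocation_map$$").getItem("country"))
                .drop("$$tmp_iplocation_map$$");
    }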
*/ -public final class IplocationStep extends AbstractIplocationStep{ +public final class IplocationStep extends AbstractIplocationStep { + private static final Logger LOGGER = LoggerFactory.getLogger(IplocationStep.class); + public IplocationStep() { super(); } @@ -94,24 +95,37 @@ public Dataset get(Dataset dataset) { if (isGeoIPDatabase) { LOGGER.info("Detected GeoIP database"); - udf = functions.udf( - new IplocationGeoIPDataMapper(this.pathToDb, this.catCtx.nullValue, - extractMapFromHadoopCfg(this.catCtx.getSparkSession().sparkContext().hadoopConfiguration())), - DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType, true)); + udf = functions + .udf( + new IplocationGeoIPDataMapper( + this.pathToDb, + this.catCtx.nullValue, + extractMapFromHadoopCfg( + this.catCtx.getSparkSession().sparkContext().hadoopConfiguration() + ) + ), DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType, true) + ); } else { LOGGER.info("Detected rir database"); - udf = functions.udf(new IplocationRirDataMapper(this.pathToDb, this.catCtx.nullValue, - extractMapFromHadoopCfg(this.catCtx.getSparkSession().sparkContext().hadoopConfiguration())), - DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType, true)); + udf = functions + .udf( + new IplocationRirDataMapper( + this.pathToDb, + this.catCtx.nullValue, + extractMapFromHadoopCfg( + this.catCtx.getSparkSession().sparkContext().hadoopConfiguration() + ) + ), DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType, true) + ); } this.catCtx.getSparkSession().udf().register("UDF_IPLocation", udf); // Run udf - Column udfResult = functions.callUDF("UDF_IPLocation", - functions.col(field), functions.lit(lang), functions.lit(true)); + Column udfResult = functions + .callUDF("UDF_IPLocation", functions.col(field), functions.lit(lang), functions.lit(true)); // Different columns based on allfields parameter and database type List mapKeys; @@ -149,8 +163,9 @@ else if (isGeoIPDatabase) { } /** - * Extracts the inner key-value map of a Hadoop configuration, allowing it to be used - * in a user-defined function, as the Configuration item itself is not Serializable. + * Extracts the inner key-value map of a Hadoop configuration, allowing it to be used in a user-defined function, as + * the Configuration item itself is not Serializable. + * * @param hadoopCfg Hadoop configuration object * @return String, String mapping of the inner config */ diff --git a/src/main/java/com/teragrep/pth10/steps/join/AbstractJoinStep.java b/src/main/java/com/teragrep/pth10/steps/join/AbstractJoinStep.java index 7a02c4b..bfb761d 100644 --- a/src/main/java/com/teragrep/pth10/steps/join/AbstractJoinStep.java +++ b/src/main/java/com/teragrep/pth10/steps/join/AbstractJoinStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
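extractMapFromHadoopCfg, whose javadoc appears above, copies the Hadoop configuration entries into a plain map so that a serializable UDF can capture them. Its body is not visible in this patch; one way to do it, sketched here on the assumption that iterating the Configuration's entries is sufficient:

    import org.apache.hadoop.conf.Configuration;
    import java.util.HashMap;
    import java.util.Map;

    // Sketch: Configuration is Iterable over its key/value entries, so the
    // contents can be copied into a serializable HashMap for use inside a UDF.
    static Map<String, String> extractMapFromHadoopCfg(Configuration hadoopCfg) {
        Map<String, String> map = new HashMap<>();
        for (Map.Entry<String, String> entry : hadoopCfg) {
            map.put(entry.getKey(), entry.getValue());
        }
        return map;
    }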
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.join; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -55,6 +54,7 @@ import java.util.List; public abstract class AbstractJoinStep extends AbstractStep { + protected String joinMode = null; protected Boolean usetime = null; protected Boolean earlier = null; diff --git a/src/main/java/com/teragrep/pth10/steps/join/JoinStep.java b/src/main/java/com/teragrep/pth10/steps/join/JoinStep.java index ae75d59..5bf709b 100644 --- a/src/main/java/com/teragrep/pth10/steps/join/JoinStep.java +++ b/src/main/java/com/teragrep/pth10/steps/join/JoinStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.join; import com.teragrep.pth10.steps.subsearch.AbstractSubsearchStep; @@ -62,7 +61,9 @@ import java.util.UUID; public final class JoinStep extends AbstractJoinStep { + private static final Logger LOGGER = LoggerFactory.getLogger(JoinStep.class); + public JoinStep() { super(); } @@ -74,7 +75,6 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { return null; } - // prepare subsearchStep and get the dataset this.subsearchStep.setListener(this.catCtx.getInternalStreamingQueryListener()); this.subsearchStep.setHdfsPath(this.pathForSubsearchSave); @@ -104,23 +104,24 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // Create subsearch to disk writer and start query final StructType subSchema = convertedSubSearchDataset.schema(); - DataStreamWriter subToDiskWriter = - convertedSubSearchDataset - .writeStream() - .format("avro") - .trigger(Trigger.ProcessingTime(0)) - // .option("spark.cleaner.referenceTracking.cleanCheckpoints", "true") - .option("checkpointLocation", checkpointPath) - .option("path", path) - .outputMode("append"); + DataStreamWriter subToDiskWriter = convertedSubSearchDataset + .writeStream() + .format("avro") + .trigger(Trigger.ProcessingTime(0)) + // .option("spark.cleaner.referenceTracking.cleanCheckpoints", "true") + .option("checkpointLocation", checkpointPath) + .option("path", path) + .outputMode("append"); // Use StreamingQueryListener to stop query when no progress is detected - StreamingQuery subToDiskQuery = this.getCatCtx().getInternalStreamingQueryListener().registerQuery(queryName, subToDiskWriter); + StreamingQuery subToDiskQuery = this + .getCatCtx() + .getInternalStreamingQueryListener() + .registerQuery(queryName, subToDiskWriter); // Await for StreamingQueryListener to call stop() subToDiskQuery.awaitTermination(); - // Read from disk to dataframe Dataset readFromDisk = 
ss.sqlContext().read().format("avro").schema(subSchema).load(path); // retrieve original column names @@ -168,10 +169,20 @@ else if (max != null) { // Check that join on field is present on both datasets if (Arrays.stream(dataset.schema().fields()).noneMatch(x -> x.name().equals(fieldName))) { - throw new RuntimeException("Join command encountered an error: main dataset (left side) missing expected field '" + fieldName + "'"); + throw new RuntimeException( + "Join command encountered an error: main dataset (left side) missing expected field '" + + fieldName + "'" + ); } - else if (Arrays.stream(out.schema().fields()).noneMatch(x -> x.name().equals(subSearchPrefix.concat(fieldName)))){ - throw new RuntimeException("Join command encountered an error: Subsearch dataset (right side) missing expected field '" + fieldName + "'"); + else if ( + Arrays + .stream(out.schema().fields()) + .noneMatch(x -> x.name().equals(subSearchPrefix.concat(fieldName))) + ) { + throw new RuntimeException( + "Join command encountered an error: Subsearch dataset (right side) missing expected field '" + + fieldName + "'" + ); } if (joinExpr == null) { @@ -181,11 +192,11 @@ else if (Arrays.stream(out.schema().fields()).noneMatch(x -> x.name().equals(sub joinExpr = joinExpr.and(dataset.col(fieldName).equalTo(out.col(subSearchPrefix + fieldName))); } } - } else { + } + else { throw new IllegalStateException("Join command was not provided with the necessary field(s) to join on!"); } - // If parameters usetime=true, earlier=true if (usetime != null && usetime && earlier != null && earlier) { LOGGER.info("usetime=true, earlier=true (with joinExpr)"); @@ -211,8 +222,12 @@ else if (usetime != null && usetime && earlier != null && !earlier) { // and use coalesce to overwrite if (overwrite != null && overwrite) { for (String colName : originalLeftSideCols) { - if (Arrays.toString(originalRightSideCols).contains(subSearchPrefix + colName) && !listOfFields.contains(colName)) { - result = result.withColumn(colName, functions.coalesce(functions.col(colName), functions.col(subSearchPrefix + colName))).drop(subSearchPrefix + colName); + if ( + Arrays.toString(originalRightSideCols).contains(subSearchPrefix + colName) + && !listOfFields.contains(colName) + ) { + result = result + .withColumn(colName, functions.coalesce(functions.col(colName), functions.col(subSearchPrefix + colName))).drop(subSearchPrefix + colName); } } } diff --git a/src/main/java/com/teragrep/pth10/steps/logicalCatalyst/LogicalCatalystStep.java b/src/main/java/com/teragrep/pth10/steps/logicalCatalyst/LogicalCatalystStep.java index dbadcbf..a8a8fd1 100644 --- a/src/main/java/com/teragrep/pth10/steps/logicalCatalyst/LogicalCatalystStep.java +++ b/src/main/java/com/teragrep/pth10/steps/logicalCatalyst/LogicalCatalystStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.logicalCatalyst; import com.teragrep.pth10.steps.AbstractStep; @@ -71,7 +70,8 @@ public Column getFilterColumn() { public Dataset get(Dataset dataset) { if (dataset != null) { return dataset.where(this.filterColumn); - } else { + } + else { throw new RuntimeException("LogicalCatalystStep got a null dataset!"); } } diff --git a/src/main/java/com/teragrep/pth10/steps/logicalXML/LogicalXMLStep.java b/src/main/java/com/teragrep/pth10/steps/logicalXML/LogicalXMLStep.java index be8ca89..e5d9469 100644 --- a/src/main/java/com/teragrep/pth10/steps/logicalXML/LogicalXMLStep.java +++ b/src/main/java/com/teragrep/pth10/steps/logicalXML/LogicalXMLStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.logicalXML; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -90,7 +89,11 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { if (this.catCtx != null && this.catCtx.getConfig() != null && !this.catCtx.getTestingMode()) { // Perform archive query if (!this.archiveQuery.isStub) { - LOGGER.info("Constructing data stream with query=<{}> metadata=<{}>", this.archiveQuery, isMetadataQuery); + LOGGER + .info( + "Constructing data stream with query=<{}> metadata=<{}>", this.archiveQuery, + isMetadataQuery + ); DPLDatasource datasource = new DPLDatasource(catCtx); ds = datasource.constructStreams(this.archiveQuery, isMetadataQuery); LOGGER.info("Received dataset with columns: <{}>", Arrays.toString(ds.columns())); @@ -104,10 +107,12 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // Testing mode? 
if (catCtx != null && catCtx.getDs() != null && !archiveQuery.isStub) { ds = catCtx.getDs(); - } else if (catCtx != null && archiveQuery.isStub) { + } + else if (catCtx != null && archiveQuery.isStub) { // generate empty dataset even in testing mode if no archive query was generated ds = new GeneratedDatasource(catCtx).constructEmptyStream(); - } else { + } + else { throw new RuntimeException("CatCtx didn't have a config and it's dataset is null as well!"); } } diff --git a/src/main/java/com/teragrep/pth10/steps/makeresults/AbstractMakeresultsStep.java b/src/main/java/com/teragrep/pth10/steps/makeresults/AbstractMakeresultsStep.java index cb11188..c181f52 100644 --- a/src/main/java/com/teragrep/pth10/steps/makeresults/AbstractMakeresultsStep.java +++ b/src/main/java/com/teragrep/pth10/steps/makeresults/AbstractMakeresultsStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.makeresults; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -53,6 +52,7 @@ import java.util.List; public abstract class AbstractMakeresultsStep extends AbstractStep { + protected int count = 1; protected List serverGroups = new ArrayList<>(); protected String server = null; diff --git a/src/main/java/com/teragrep/pth10/steps/makeresults/MakeresultsStep.java b/src/main/java/com/teragrep/pth10/steps/makeresults/MakeresultsStep.java index de7cd3f..196451a 100644 --- a/src/main/java/com/teragrep/pth10/steps/makeresults/MakeresultsStep.java +++ b/src/main/java/com/teragrep/pth10/steps/makeresults/MakeresultsStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.makeresults; import org.apache.spark.sql.*; @@ -68,10 +67,12 @@ import java.util.ArrayList; import java.util.List; -public final class MakeresultsStep extends AbstractMakeresultsStep{ +public final class MakeresultsStep extends AbstractMakeresultsStep { + public MakeresultsStep() { super(); } + @Override public Dataset get(Dataset dataset) throws StreamingQueryException { /*if (dataset == null) { @@ -81,26 +82,20 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // change schema based on annotate parameter StructType schema; if (annotate) { - schema = - new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("struck_server", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("struck_server_group", DataTypes.StringType, true, new MetadataBuilder().build()) - } - ); + schema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("struck_server", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("struck_server_group", DataTypes.StringType, true, new MetadataBuilder().build()) + }); } else { - schema = - new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()) - } - ); + schema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()) + }); } // make a streaming dataset @@ -110,12 +105,13 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { MemoryStream rowMemoryStream = new MemoryStream<>(1, sqlCtx, Option.apply(1), encoder); Dataset generated = rowMemoryStream.toDS(); - final String queryName = "makeresults_" + ((int)(Math.random() * 100000)); + final String queryName = "makeresults_" + ((int) (Math.random() * 100000)); - DataStreamWriter makeResultsWriter = generated. - writeStream().outputMode("append").format("memory"); + DataStreamWriter makeResultsWriter = generated.writeStream().outputMode("append").format("memory"); - StreamingQuery makeResultsQuery = this.catCtx.getInternalStreamingQueryListener().registerQuery(queryName, makeResultsWriter); + StreamingQuery makeResultsQuery = this.catCtx + .getInternalStreamingQueryListener() + .registerQuery(queryName, makeResultsWriter); // add row $count times rowMemoryStream.addData(makeRows(count, annotate)); @@ -128,7 +124,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { /** * Make one row $amount times and return as {@literal Seq}
* Uses system default timezone - * @param amount How many times each row should be repeated? + * + * @param amount How many times each row should be repeated? * @param annotate Add more columns in addition to '_time'? * @return scala sequence of Rows */ @@ -137,20 +134,12 @@ private Seq makeRows(int amount, boolean annotate) { Row row; if (annotate) { - row = RowFactory.create( - Timestamp.valueOf(LocalDateTime.ofInstant(Instant.now(), ZoneOffset.systemDefault())), - catCtx.nullValue.value(), - catCtx.nullValue.value(), - catCtx.nullValue.value(), - catCtx.nullValue.value(), - catCtx.nullValue.value(), - catCtx.nullValue.value() - ); + row = RowFactory + .create(Timestamp.valueOf(LocalDateTime.ofInstant(Instant.now(), ZoneOffset.systemDefault())), catCtx.nullValue.value(), catCtx.nullValue.value(), catCtx.nullValue.value(), catCtx.nullValue.value(), catCtx.nullValue.value(), catCtx.nullValue.value()); } else { - row = RowFactory.create( - Timestamp.valueOf(LocalDateTime.ofInstant(Instant.now(), ZoneOffset.systemDefault())) - ); + row = RowFactory + .create(Timestamp.valueOf(LocalDateTime.ofInstant(Instant.now(), ZoneOffset.systemDefault()))); } while (amount > 0) { diff --git a/src/main/java/com/teragrep/pth10/steps/predict/AbstractPredictStep.java b/src/main/java/com/teragrep/pth10/steps/predict/AbstractPredictStep.java index 5356483..428c540 100644 --- a/src/main/java/com/teragrep/pth10/steps/predict/AbstractPredictStep.java +++ b/src/main/java/com/teragrep/pth10/steps/predict/AbstractPredictStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.predict; import com.teragrep.pth10.steps.AbstractStep; @@ -52,9 +51,11 @@ import java.util.List; public abstract class AbstractPredictStep extends AbstractStep { + public enum Algorithm { LL, LLT, LLP, LLP5, LLB, BILL } + protected Algorithm algorithm; protected List listOfColumnsToPredict; protected String correlateField; @@ -66,6 +67,7 @@ public enum Algorithm { protected int lower; protected String upperField; protected String lowerField; + public AbstractPredictStep() { super(); } diff --git a/src/main/java/com/teragrep/pth10/steps/predict/PredictStep.java b/src/main/java/com/teragrep/pth10/steps/predict/PredictStep.java index 4bc2039..fb5572a 100644 --- a/src/main/java/com/teragrep/pth10/steps/predict/PredictStep.java +++ b/src/main/java/com/teragrep/pth10/steps/predict/PredictStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.predict; import org.apache.spark.sql.*; @@ -58,7 +57,9 @@ import java.util.List; public final class PredictStep extends AbstractPredictStep { + private static final Logger LOGGER = LoggerFactory.getLogger(PredictStep.class); + public PredictStep() { super(); this.properties.add(CommandProperty.SEQUENTIAL_ONLY); @@ -78,7 +79,10 @@ public Dataset get(Dataset dataset) { case LLT: return llt(dataset); default: - throw new IllegalArgumentException("Algorithm '" + this.algorithm + "' is not yet supported by the predict command. Use 'LL' (default) or 'LLT' instead."); + throw new IllegalArgumentException( + "Algorithm '" + this.algorithm + + "' is not yet supported by the predict command. Use 'LL' (default) or 'LLT' instead." + ); } } @@ -97,7 +101,7 @@ private double avg(List lr) { sum += val; } - return sum/count; + return sum / count; } private Dataset llt(Dataset dataset) { @@ -108,16 +112,16 @@ private Dataset llt(Dataset dataset) { int indexOfAliasBeginningQuote = predictCol.toString().indexOf(" AS "); String predictFieldName = "prediction(" + predictCol + ")"; if (indexOfAliasBeginningQuote != -1) { - predictFieldName = predictCol.toString().substring(indexOfAliasBeginningQuote+4); + predictFieldName = predictCol.toString().substring(indexOfAliasBeginningQuote + 4); } // upper/lower confidence interval column naming // default: upperXX(predictField) // customized: abc(predictField) - String upperFieldName = this.upperField == null ? ("upper" + this.upper + "(" + predictFieldName + ")") - : (this.upperField + "(" + predictFieldName + ")"); - String lowerFieldName = this.lowerField == null ? ("lower" + this.lower + "(" + predictFieldName + ")") - : (this.lowerField + "(" + predictFieldName + ")"); + String upperFieldName = this.upperField == null ? 
("upper" + this.upper + "(" + predictFieldName + + ")") : (this.upperField + "(" + predictFieldName + ")"); + String lowerFieldName = this.lowerField == null ? ("lower" + this.lower + "(" + predictFieldName + + ")") : (this.lowerField + "(" + predictFieldName + ")"); //label = to predict; "count" //feature = _time @@ -147,8 +151,8 @@ private Dataset llt(Dataset dataset) { double W2 = 1.271; // slope eq. error guess // confidence intervals - double upperMultiplier = (this.upper/100d) + 1d; - double lowerMultiplier = (this.lower/100d) + 1d; + double upperMultiplier = (this.upper / 100d) + 1d; + double lowerMultiplier = (this.lower / 100d) + 1d; // list of rows of predictions List listOfPredRows = new ArrayList<>(); @@ -156,42 +160,42 @@ private Dataset llt(Dataset dataset) { // initial values Q[0] = 2.0; v[0] = 0.02; //initial slope - K[0] = Q[0]/(Q[0]+e); + K[0] = Q[0] / (Q[0] + e); double mu_0 = avg(y); - mu[0] = mu_0+ v[0] + K[0]*(toDbl(y.get(0).get(0))-mu_0); + mu[0] = mu_0 + v[0] + K[0] * (toDbl(y.get(0).get(0)) - mu_0); - CI_upper[0] = mu[0]+upperMultiplier*Math.sqrt(Q[0]); - CI_lower[0] = mu[0]-lowerMultiplier*Math.sqrt(Q[0]); + CI_upper[0] = mu[0] + upperMultiplier * Math.sqrt(Q[0]); + CI_lower[0] = mu[0] - lowerMultiplier * Math.sqrt(Q[0]); // add first predicted row - listOfPredRows.add(RowFactory.create(y.get(0).getTimestamp(1),toDbl(y.get(0).get(0)), mu[0], CI_upper[0], CI_lower[0])); + listOfPredRows + .add(RowFactory.create(y.get(0).getTimestamp(1), toDbl(y.get(0).get(0)), mu[0], CI_upper[0], CI_lower[0])); for (int t = 1; t < n; t++) { // update measurement - mu[t] = mu[t-1]+v[t-1]+K[t-1]*(toDbl(y.get(t-1).get(0))-mu[t-1]); - v[t] = v[t-1] + W2; - Q[t] = (1-K[t-1])*Q[t-1]+W1; - K[t] = Q[t]/(Q[t]+e); + mu[t] = mu[t - 1] + v[t - 1] + K[t - 1] * (toDbl(y.get(t - 1).get(0)) - mu[t - 1]); + v[t] = v[t - 1] + W2; + Q[t] = (1 - K[t - 1]) * Q[t - 1] + W1; + K[t] = Q[t] / (Q[t] + e); - CI_upper[t] = mu[t]+upperMultiplier*Math.sqrt(Q[t]); - CI_lower[t] = mu[t]-lowerMultiplier*Math.sqrt(Q[t]); + CI_upper[t] = mu[t] + upperMultiplier * Math.sqrt(Q[t]); + CI_lower[t] = mu[t] - lowerMultiplier * Math.sqrt(Q[t]); //System.out.printf("mu: %s, Q: %s, K: %s, CI-U: %s, CI-L: %s%n", // mu[t], Q[t], K[t], CI_upper[t], CI_lower[t]); - listOfPredRows.add(RowFactory.create(y.get(t).getTimestamp(1), toDbl(y.get(t).get(0)), mu[t], CI_upper[t], CI_lower[t])); + listOfPredRows + .add(RowFactory.create(y.get(t).getTimestamp(1), toDbl(y.get(t).get(0)), mu[t], CI_upper[t], CI_lower[t])); } // generate dataframe from predictions - final StructType sch = new StructType( - new StructField[]{ - StructField.apply("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - StructField.apply("y", DataTypes.DoubleType, true, new MetadataBuilder().build()), - StructField.apply(predictFieldName, DataTypes.DoubleType, true, new MetadataBuilder().build()), - StructField.apply(upperFieldName, DataTypes.DoubleType, true, new MetadataBuilder().build()), - StructField.apply(lowerFieldName, DataTypes.DoubleType, true, new MetadataBuilder().build()) - } - ); + final StructType sch = new StructType(new StructField[] { + StructField.apply("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + StructField.apply("y", DataTypes.DoubleType, true, new MetadataBuilder().build()), + StructField.apply(predictFieldName, DataTypes.DoubleType, true, new MetadataBuilder().build()), + StructField.apply(upperFieldName, DataTypes.DoubleType, true, new MetadataBuilder().build()), + 
StructField.apply(lowerFieldName, DataTypes.DoubleType, true, new MetadataBuilder().build()) + }); Dataset rv; if (SparkSession.getActiveSession().nonEmpty()) { @@ -203,8 +207,7 @@ private Dataset llt(Dataset dataset) { // join predictions with original dataset rv = ds.join(rv, "_time"); - rv = rv.orderBy(functions.col("_time").asc()).drop( "y"); - + rv = rv.orderBy(functions.col("_time").asc()).drop("y"); // future forecasting: list of forecasts List listOfForecasts = new ArrayList<>(); @@ -229,39 +232,40 @@ private Dataset llt(Dataset dataset) { double[] CI_l = initArrayWithValue(0, f); // add initial forecast - mu_f[0] = mu[mu.length-1]+v[v.length-1]+K[K.length-1]*(toDbl(y.get(y.size()-1).get(0))-mu[mu.length-1]); - v_f[0] = v[v.length-1]+W2; + mu_f[0] = mu[mu.length - 1] + v[v.length - 1] + + K[K.length - 1] * (toDbl(y.get(y.size() - 1).get(0)) - mu[mu.length - 1]); + v_f[0] = v[v.length - 1] + W2; - Q_f[0] = (1-K[K.length-1])*Q[Q.length-1]+W1; - K_f[0] = Q_f[0]/((Q_f[0])+e); - CI_u[0] = mu_f[0]+upperMultiplier*Math.sqrt(Q_f[0]); - CI_l[0] = mu_f[0]-lowerMultiplier*Math.sqrt(Q_f[0]); + Q_f[0] = (1 - K[K.length - 1]) * Q[Q.length - 1] + W1; + K_f[0] = Q_f[0] / ((Q_f[0]) + e); + CI_u[0] = mu_f[0] + upperMultiplier * Math.sqrt(Q_f[0]); + CI_l[0] = mu_f[0] - lowerMultiplier * Math.sqrt(Q_f[0]); - listOfForecasts.add(RowFactory.create(Timestamp.from(Instant.ofEpochMilli(last)),null, mu_f[0], CI_u[0], CI_l[0])); + listOfForecasts + .add(RowFactory.create(Timestamp.from(Instant.ofEpochMilli(last)), null, mu_f[0], CI_u[0], CI_l[0])); for (int t = 1; t < f; t++) { //System.out.println("t(2)= " + t); // measurement update - mu_f[t] = mu_f[t-1]+v_f[t-1]+K_f[t-1]*(toDbl(y.get(y.size()-1).get(0))-mu_f[t-1]); - v_f[t] = v_f[t-1] + W2; - Q_f[t] = (1-K_f[t-1])*Q_f[t-1]+W1; - K_f[t] = Q_f[t]/(Q_f[t]+0); - CI_u[t] = mu_f[t]+upperMultiplier*Math.sqrt(Q_f[t]); - CI_l[t] = mu_f[t]-lowerMultiplier*Math.sqrt(Q_f[t]); + mu_f[t] = mu_f[t - 1] + v_f[t - 1] + K_f[t - 1] * (toDbl(y.get(y.size() - 1).get(0)) - mu_f[t - 1]); + v_f[t] = v_f[t - 1] + W2; + Q_f[t] = (1 - K_f[t - 1]) * Q_f[t - 1] + W1; + K_f[t] = Q_f[t] / (Q_f[t] + 0); + CI_u[t] = mu_f[t] + upperMultiplier * Math.sqrt(Q_f[t]); + CI_l[t] = mu_f[t] - lowerMultiplier * Math.sqrt(Q_f[t]); last += diff; - listOfForecasts.add(RowFactory.create(Timestamp.from(Instant.ofEpochMilli(last)), null, mu_f[t], CI_u[t], CI_l[t])); + listOfForecasts + .add(RowFactory.create(Timestamp.from(Instant.ofEpochMilli(last)), null, mu_f[t], CI_u[t], CI_l[t])); } // new df from forecast - final StructType sch2 = new StructType( - new StructField[]{ - StructField.apply("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - StructField.apply("y", DataTypes.DoubleType, true, new MetadataBuilder().build()), - StructField.apply("pred", DataTypes.DoubleType, true, new MetadataBuilder().build()), - StructField.apply("CI_upper", DataTypes.DoubleType, true, new MetadataBuilder().build()), - StructField.apply("CI_lower", DataTypes.DoubleType, true, new MetadataBuilder().build()) - } - ); + final StructType sch2 = new StructType(new StructField[] { + StructField.apply("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + StructField.apply("y", DataTypes.DoubleType, true, new MetadataBuilder().build()), + StructField.apply("pred", DataTypes.DoubleType, true, new MetadataBuilder().build()), + StructField.apply("CI_upper", DataTypes.DoubleType, true, new MetadataBuilder().build()), + StructField.apply("CI_lower", DataTypes.DoubleType, true, new 
MetadataBuilder().build()) + }); Dataset rv2; if (SparkSession.getActiveSession().nonEmpty()) { @@ -289,16 +293,16 @@ private Dataset ll(Dataset dataset) { int indexOfAliasBeginningQuote = predictCol.toString().indexOf(" AS "); String predictFieldName = "prediction(" + predictCol + ")"; if (indexOfAliasBeginningQuote != -1) { - predictFieldName = predictCol.toString().substring(indexOfAliasBeginningQuote+4); + predictFieldName = predictCol.toString().substring(indexOfAliasBeginningQuote + 4); } // upper/lower confidence interval column naming // default: upperXX(predictField) // customized: abc(predictField) - String upperFieldName = this.upperField == null ? ("upper" + this.upper + "(" + predictFieldName + ")") - : (this.upperField + "(" + predictFieldName + ")"); - String lowerFieldName = this.lowerField == null ? ("lower" + this.lower + "(" + predictFieldName + ")") - : (this.lowerField + "(" + predictFieldName + ")"); + String upperFieldName = this.upperField == null ? ("upper" + this.upper + "(" + predictFieldName + + ")") : (this.upperField + "(" + predictFieldName + ")"); + String lowerFieldName = this.lowerField == null ? ("lower" + this.lower + "(" + predictFieldName + + ")") : (this.lowerField + "(" + predictFieldName + ")"); //label = to predict; "count" //feature = _time @@ -325,47 +329,46 @@ private Dataset ll(Dataset dataset) { double W = 1.271; // State equation variance guess // confidence intervals - double upperMultiplier = (this.upper/100d) + 1d; - double lowerMultiplier = (this.lower/100d) + 1d; + double upperMultiplier = (this.upper / 100d) + 1d; + double lowerMultiplier = (this.lower / 100d) + 1d; // list of rows of predictions List listOfPredRows = new ArrayList<>(); // initial values Q[0] = 2.0; - K[0] = Q[0]/(Q[0]+e); + K[0] = Q[0] / (Q[0] + e); double mu_0 = avg(y); - mu[0] = mu_0+K[0]*(toDbl(y.get(0).get(0))-mu_0); - CI_upper[0] = mu[0]+upperMultiplier*Math.sqrt(Q[0]); - CI_lower[0] = mu[0]-lowerMultiplier*Math.sqrt(Q[0]); + mu[0] = mu_0 + K[0] * (toDbl(y.get(0).get(0)) - mu_0); + CI_upper[0] = mu[0] + upperMultiplier * Math.sqrt(Q[0]); + CI_lower[0] = mu[0] - lowerMultiplier * Math.sqrt(Q[0]); // add first predicted row listOfPredRows.add(RowFactory.create(y.get(0), y.get(0).getTimestamp(1), mu[0], CI_upper[0], CI_lower[0])); for (int t = 1; t < n; t++) { // update measurement - mu[t] = mu[t-1]+K[t-1]*(toDbl(y.get(t-1).get(0))-mu[t-1]); - Q[t] = (1-K[t-1])*Q[t-1]+W; - K[t] = Q[t]/(Q[t]+e); + mu[t] = mu[t - 1] + K[t - 1] * (toDbl(y.get(t - 1).get(0)) - mu[t - 1]); + Q[t] = (1 - K[t - 1]) * Q[t - 1] + W; + K[t] = Q[t] / (Q[t] + e); - CI_upper[t] = mu[t]+upperMultiplier*Math.sqrt(Q[t]); - CI_lower[t] = mu[t]-lowerMultiplier*Math.sqrt(Q[t]); + CI_upper[t] = mu[t] + upperMultiplier * Math.sqrt(Q[t]); + CI_lower[t] = mu[t] - lowerMultiplier * Math.sqrt(Q[t]); //System.out.printf("mu: %s, Q: %s, K: %s, CI-U: %s, CI-L: %s%n", // mu[t], Q[t], K[t], CI_upper[t], CI_lower[t]); - listOfPredRows.add(RowFactory.create(y.get(t).get(0), y.get(t).getTimestamp(1), mu[t], CI_upper[t], CI_lower[t])); + listOfPredRows + .add(RowFactory.create(y.get(t).get(0), y.get(t).getTimestamp(1), mu[t], CI_upper[t], CI_lower[t])); } // generate dataframe from predictions - final StructType sch = new StructType( - new StructField[]{ - StructField.apply("y", DataTypes.DoubleType, true, new MetadataBuilder().build()), - StructField.apply("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - StructField.apply(predictFieldName, DataTypes.DoubleType, true, new 
MetadataBuilder().build()), - StructField.apply(upperFieldName, DataTypes.DoubleType, true, new MetadataBuilder().build()), - StructField.apply(lowerFieldName, DataTypes.DoubleType, true, new MetadataBuilder().build()) - } - ); + final StructType sch = new StructType(new StructField[] { + StructField.apply("y", DataTypes.DoubleType, true, new MetadataBuilder().build()), + StructField.apply("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + StructField.apply(predictFieldName, DataTypes.DoubleType, true, new MetadataBuilder().build()), + StructField.apply(upperFieldName, DataTypes.DoubleType, true, new MetadataBuilder().build()), + StructField.apply(lowerFieldName, DataTypes.DoubleType, true, new MetadataBuilder().build()) + }); Dataset rv; if (SparkSession.getActiveSession().nonEmpty()) { @@ -379,7 +382,6 @@ private Dataset ll(Dataset dataset) { rv = ds.join(rv, "_time"); rv = rv.orderBy(functions.col("_time").asc()).drop("y"); - // future forecasting: list of forecasts List listOfForecasts = new ArrayList<>(); @@ -402,35 +404,35 @@ private Dataset ll(Dataset dataset) { double[] CI_l = initArrayWithValue(0, f); // add initial forecast - mu_f[0] = mu[mu.length-1]+K[K.length-1]*(toDbl(y.get(y.size()-1).get(0))-mu[mu.length-1]); - Q_f[0] = (1-K[K.length-1])*Q[Q.length-1]+W; - K_f[0] = Q_f[0]/((Q_f[0])+e); - CI_u[0] = mu_f[0]+upperMultiplier*Math.sqrt(Q_f[0]); - CI_l[0] = mu_f[0]-lowerMultiplier*Math.sqrt(Q_f[0]); - - listOfForecasts.add(RowFactory.create(Timestamp.from(Instant.ofEpochMilli(last)),null, mu_f[0], CI_u[0], CI_l[0])); + mu_f[0] = mu[mu.length - 1] + K[K.length - 1] * (toDbl(y.get(y.size() - 1).get(0)) - mu[mu.length - 1]); + Q_f[0] = (1 - K[K.length - 1]) * Q[Q.length - 1] + W; + K_f[0] = Q_f[0] / ((Q_f[0]) + e); + CI_u[0] = mu_f[0] + upperMultiplier * Math.sqrt(Q_f[0]); + CI_l[0] = mu_f[0] - lowerMultiplier * Math.sqrt(Q_f[0]); + + listOfForecasts + .add(RowFactory.create(Timestamp.from(Instant.ofEpochMilli(last)), null, mu_f[0], CI_u[0], CI_l[0])); for (int t = 1; t < f; t++) { // measurement update - mu_f[t] = mu_f[t-1]+K_f[t-1]*(toDbl(y.get(y.size()-1).get(0))-mu_f[t-1]); - Q_f[t] = (1-K_f[t-1])*Q_f[t-1]+W; - K_f[t] = Q_f[t]/(Q_f[t]+0); - CI_u[t] = mu_f[t]+upperMultiplier*Math.sqrt(Q_f[t]); - CI_l[t] = mu_f[t]-lowerMultiplier*Math.sqrt(Q_f[t]); + mu_f[t] = mu_f[t - 1] + K_f[t - 1] * (toDbl(y.get(y.size() - 1).get(0)) - mu_f[t - 1]); + Q_f[t] = (1 - K_f[t - 1]) * Q_f[t - 1] + W; + K_f[t] = Q_f[t] / (Q_f[t] + 0); + CI_u[t] = mu_f[t] + upperMultiplier * Math.sqrt(Q_f[t]); + CI_l[t] = mu_f[t] - lowerMultiplier * Math.sqrt(Q_f[t]); last += diff; - listOfForecasts.add(RowFactory.create(Timestamp.from(Instant.ofEpochMilli(last)), null, mu_f[t], CI_u[t], CI_l[t])); + listOfForecasts + .add(RowFactory.create(Timestamp.from(Instant.ofEpochMilli(last)), null, mu_f[t], CI_u[t], CI_l[t])); } // new df from forecast - final StructType sch2 = new StructType( - new StructField[]{ - StructField.apply("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - StructField.apply("y", DataTypes.DoubleType, true, new MetadataBuilder().build()), - StructField.apply("pred", DataTypes.DoubleType, true, new MetadataBuilder().build()), - StructField.apply("CI_upper", DataTypes.DoubleType, true, new MetadataBuilder().build()), - StructField.apply("CI_lower", DataTypes.DoubleType, true, new MetadataBuilder().build()) - } - ); + final StructType sch2 = new StructType(new StructField[] { + StructField.apply("_time", DataTypes.TimestampType, false, new 
MetadataBuilder().build()), + StructField.apply("y", DataTypes.DoubleType, true, new MetadataBuilder().build()), + StructField.apply("pred", DataTypes.DoubleType, true, new MetadataBuilder().build()), + StructField.apply("CI_upper", DataTypes.DoubleType, true, new MetadataBuilder().build()), + StructField.apply("CI_lower", DataTypes.DoubleType, true, new MetadataBuilder().build()) + }); Dataset rv2; if (SparkSession.getActiveSession().nonEmpty()) { diff --git a/src/main/java/com/teragrep/pth10/steps/rangemap/AbstractRangemapStep.java b/src/main/java/com/teragrep/pth10/steps/rangemap/AbstractRangemapStep.java index 673e335..dd123c0 100644 --- a/src/main/java/com/teragrep/pth10/steps/rangemap/AbstractRangemapStep.java +++ b/src/main/java/com/teragrep/pth10/steps/rangemap/AbstractRangemapStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -50,6 +50,7 @@ import java.util.Map; public abstract class AbstractRangemapStep extends AbstractStep { + public String sourceField; public String defaultValue = "None"; public Map attributeRangeMap; diff --git a/src/main/java/com/teragrep/pth10/steps/rangemap/RangemapStep.java b/src/main/java/com/teragrep/pth10/steps/rangemap/RangemapStep.java index c5d558b..2e198be 100644 --- a/src/main/java/com/teragrep/pth10/steps/rangemap/RangemapStep.java +++ b/src/main/java/com/teragrep/pth10/steps/rangemap/RangemapStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -55,6 +55,7 @@ import java.util.Map; public final class RangemapStep extends AbstractRangemapStep { + @Override public Dataset get(Dataset dataset) { // check values @@ -80,7 +81,7 @@ public Dataset get(Dataset dataset) { Column mapCol = functions.map_from_arrays(keyCol, valueCol); // apply udf to column "range" - return dataset.withColumn("range", functions.callUDF("RangemapUDF", - functions.col(sourceField), functions.lit(defaultValue), mapCol)); + return dataset + .withColumn("range", functions.callUDF("RangemapUDF", functions.col(sourceField), functions.lit(defaultValue), mapCol)); } } diff --git a/src/main/java/com/teragrep/pth10/steps/rangemap/RangemapUDF.java b/src/main/java/com/teragrep/pth10/steps/rangemap/RangemapUDF.java index c1d81b9..66221fd 100644 --- a/src/main/java/com/teragrep/pth10/steps/rangemap/RangemapUDF.java +++ b/src/main/java/com/teragrep/pth10/steps/rangemap/RangemapUDF.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -54,10 +54,15 @@ import java.util.*; -public class RangemapUDF implements UDF3>, List> { +public class RangemapUDF + implements UDF3>, List> { + @Override - public List call(Object inputNumber, String defaultValue, scala.collection.immutable.Map> attributeRangeMap) - throws Exception { + public List call( + Object inputNumber, + String defaultValue, + scala.collection.immutable.Map> attributeRangeMap + ) throws Exception { // parse numbers List inputs = getAllNumbersFromInput(new ArrayList<>(), inputNumber); // when all strings, then use defaultValue. Otherwise, skip strings. 
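The substantive change repeated through these hunks is the formatter convention itself: the Spotless formatting applied by this patch puts 'else' and 'catch' on their own lines, inserts a blank line after class and enum declarations, and breaks long argument lists and call chains onto continuation lines. A minimal sketch of that convention on hypothetical code, not taken from pth_10 and assuming nothing beyond the JDK:

    public final class FormatterStyleSketch {

        // After formatting: 'else' sits on its own line and long call chains wrap.
        public String label(final int value) {
            final String result;
            if (value > 0) {
                result = "positive";
            }
            else {
                result = "non-positive";
            }
            return new StringBuilder()
                    .append("value=<")
                    .append(value)
                    .append("> label=<")
                    .append(result)
                    .append(">")
                    .toString();
        }
    }

The RangemapUDF hunk that follows shows the same rule applied to an 'else if' ladder.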
@@ -100,10 +105,12 @@ private List getAllNumbersFromInput(List targetList, Object inpu if (parsedInputNumber.getType() == ParsedResult.Type.LONG) { parsedNumber = parsedInputNumber.getLong(); targetList.add(parsedNumber); - } else if (parsedInputNumber.getType() == ParsedResult.Type.DOUBLE){ + } + else if (parsedInputNumber.getType() == ParsedResult.Type.DOUBLE) { parsedNumber = parsedInputNumber.getDouble(); targetList.add(parsedNumber); - } else if (parsedInputNumber.getType() == ParsedResult.Type.LIST) { + } + else if (parsedInputNumber.getType() == ParsedResult.Type.LIST) { for (Object item : parsedInputNumber.getList()) { targetList.addAll(getAllNumbersFromInput(targetList, item)); } diff --git a/src/main/java/com/teragrep/pth10/steps/regex/AbstractRegexStep.java b/src/main/java/com/teragrep/pth10/steps/regex/AbstractRegexStep.java index 20fa030..2d91b4e 100644 --- a/src/main/java/com/teragrep/pth10/steps/regex/AbstractRegexStep.java +++ b/src/main/java/com/teragrep/pth10/steps/regex/AbstractRegexStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,12 +43,12 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.regex; import com.teragrep.pth10.steps.AbstractStep; public abstract class AbstractRegexStep extends AbstractStep { + protected String fromField = null; protected String regexString = null; protected boolean equals = true; diff --git a/src/main/java/com/teragrep/pth10/steps/regex/RegexStep.java b/src/main/java/com/teragrep/pth10/steps/regex/RegexStep.java index b1eebbc..855e4c8 100644 --- a/src/main/java/com/teragrep/pth10/steps/regex/RegexStep.java +++ b/src/main/java/com/teragrep/pth10/steps/regex/RegexStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.regex; import com.teragrep.pth10.ast.commands.transformstatement.regex.RegexMatch; @@ -51,6 +50,7 @@ import org.apache.spark.sql.types.DataTypes; public final class RegexStep extends AbstractRegexStep { + public RegexStep() { super(); } @@ -66,7 +66,8 @@ public Dataset get(Dataset dataset) { ss.udf().register("UDF_Regex", new RegexMatch(), DataTypes.BooleanType); // apply udf - Column regexCol = functions.callUDF("UDF_Regex", functions.col(fromField), functions.lit(regexString), functions.lit(equals)); + Column regexCol = functions + .callUDF("UDF_Regex", functions.col(fromField), functions.lit(regexString), functions.lit(equals)); return dataset.where(regexCol); // apply as where statement } } diff --git a/src/main/java/com/teragrep/pth10/steps/rename/AbstractRenameStep.java b/src/main/java/com/teragrep/pth10/steps/rename/AbstractRenameStep.java index 2b99c01..d2ec909 100644 --- a/src/main/java/com/teragrep/pth10/steps/rename/AbstractRenameStep.java +++ b/src/main/java/com/teragrep/pth10/steps/rename/AbstractRenameStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.rename; import com.teragrep.pth10.steps.AbstractStep; @@ -51,7 +50,9 @@ import java.util.Map; public abstract class AbstractRenameStep extends AbstractStep { + protected Map mapOfRenamedFields = null; + public AbstractRenameStep() { super(); } diff --git a/src/main/java/com/teragrep/pth10/steps/rename/RenameStep.java b/src/main/java/com/teragrep/pth10/steps/rename/RenameStep.java index c69120a..d97b47a 100644 --- a/src/main/java/com/teragrep/pth10/steps/rename/RenameStep.java +++ b/src/main/java/com/teragrep/pth10/steps/rename/RenameStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.rename; import org.apache.spark.sql.Dataset; @@ -51,7 +50,8 @@ import java.util.Map; -public final class RenameStep extends AbstractRenameStep{ +public final class RenameStep extends AbstractRenameStep { + public RenameStep() { super(); } diff --git a/src/main/java/com/teragrep/pth10/steps/replace/AbstractReplaceStep.java b/src/main/java/com/teragrep/pth10/steps/replace/AbstractReplaceStep.java index 7b7b203..d788c21 100644 --- a/src/main/java/com/teragrep/pth10/steps/replace/AbstractReplaceStep.java +++ b/src/main/java/com/teragrep/pth10/steps/replace/AbstractReplaceStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.replace; import com.teragrep.pth10.steps.AbstractStep; @@ -52,6 +51,7 @@ import java.util.Map; public abstract class AbstractReplaceStep extends AbstractStep { + protected final List listOfFields; protected final Map replacements; diff --git a/src/main/java/com/teragrep/pth10/steps/replace/ReplaceStep.java b/src/main/java/com/teragrep/pth10/steps/replace/ReplaceStep.java index db0f82a..0b3aca3 100644 --- a/src/main/java/com/teragrep/pth10/steps/replace/ReplaceStep.java +++ b/src/main/java/com/teragrep/pth10/steps/replace/ReplaceStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.replace; import com.teragrep.pth10.ast.commands.transformstatement.replace.ReplaceCmd; @@ -54,6 +53,7 @@ import java.util.Map; public final class ReplaceStep extends AbstractReplaceStep { + public ReplaceStep(List listOfFields, Map replacements) { super(listOfFields, replacements); } @@ -82,8 +82,8 @@ public Dataset get(Dataset dataset) { // Apply the replace function to all given fields for (String field : this.listOfFields) { for (String contentToReplace : this.replacements.keySet()) { - Column res = functions.callUDF("UDF_Replace", - functions.col(field), functions.lit(contentToReplace), functions.lit(this.replacements.get(contentToReplace))); + Column res = functions + .callUDF("UDF_Replace", functions.col(field), functions.lit(contentToReplace), functions.lit(this.replacements.get(contentToReplace))); dataset = dataset.withColumn(field, res); } diff --git a/src/main/java/com/teragrep/pth10/steps/rex/AbstractRexStep.java b/src/main/java/com/teragrep/pth10/steps/rex/AbstractRexStep.java index f6106fc..c40c22b 100644 --- a/src/main/java/com/teragrep/pth10/steps/rex/AbstractRexStep.java +++ b/src/main/java/com/teragrep/pth10/steps/rex/AbstractRexStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,13 +43,13 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.rex; import com.teragrep.pth10.ast.DPLParserCatalystContext; import com.teragrep.pth10.steps.AbstractStep; public abstract class AbstractRexStep extends AbstractStep { + protected String regexStr = null; protected String field = "_raw"; protected String offsetField = null; diff --git a/src/main/java/com/teragrep/pth10/steps/rex/RexStep.java b/src/main/java/com/teragrep/pth10/steps/rex/RexStep.java index c74d123..e115eed 100644 --- a/src/main/java/com/teragrep/pth10/steps/rex/RexStep.java +++ b/src/main/java/com/teragrep/pth10/steps/rex/RexStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.rex; import com.teragrep.jpr_01.JavaPcre; @@ -56,6 +55,7 @@ import java.util.Map; public final class RexStep extends AbstractRexStep { + public RexStep() { super(); } @@ -78,9 +78,11 @@ public Dataset get(Dataset dataset) { } else { // extract mode - UserDefinedFunction udf = functions.udf(new RexExtractModeUDF(), DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType)); + UserDefinedFunction udf = functions + .udf(new RexExtractModeUDF(), DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType)); ss.udf().register("RexExtractUDF", udf); - Column udfResult = functions.callUDF("RexExtractUDF", functions.col(this.field), functions.lit(this.regexStr)); + Column udfResult = functions + .callUDF("RexExtractUDF", functions.col(this.field), functions.lit(this.regexStr)); final String outputCol = "$$dpl_internal_rex_result$$"; res = res.withColumn(outputCol, udfResult); @@ -92,11 +94,8 @@ public Dataset get(Dataset dataset) { // go through the capture group names and if value is null, apply NullValue, otherwise value for (String name : nameTable.keySet()) { - res = res.withColumn( - name, - functions.when( - functions.isnull(res.col(outputCol).getItem(name)), - functions.lit(catCtx.nullValue.value()).cast(DataTypes.StringType)).otherwise(res.col(outputCol).getItem(name))); + res = res + .withColumn(name, functions.when(functions.isnull(res.col(outputCol).getItem(name)), functions.lit(catCtx.nullValue.value()).cast(DataTypes.StringType)).otherwise(res.col(outputCol).getItem(name))); } // drop intermediate result column diff --git a/src/main/java/com/teragrep/pth10/steps/rex4j/AbstractRex4jStep.java b/src/main/java/com/teragrep/pth10/steps/rex4j/AbstractRex4jStep.java index 9a1c87f..3839ce8 100644 --- a/src/main/java/com/teragrep/pth10/steps/rex4j/AbstractRex4jStep.java +++ b/src/main/java/com/teragrep/pth10/steps/rex4j/AbstractRex4jStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,12 +43,12 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.rex4j; import com.teragrep.pth10.steps.AbstractStep; public abstract class AbstractRex4jStep extends AbstractStep { + protected String sedMode = null; protected String regexStr = null; protected String field = "_raw"; diff --git a/src/main/java/com/teragrep/pth10/steps/rex4j/Rex4jStep.java b/src/main/java/com/teragrep/pth10/steps/rex4j/Rex4jStep.java index ac69f9d..a9964ce 100644 --- a/src/main/java/com/teragrep/pth10/steps/rex4j/Rex4jStep.java +++ b/src/main/java/com/teragrep/pth10/steps/rex4j/Rex4jStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.rex4j; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -58,9 +57,11 @@ import java.util.Map; -public final class Rex4jStep extends AbstractRex4jStep{ +public final class Rex4jStep extends AbstractRex4jStep { + private static final Logger LOGGER = LoggerFactory.getLogger(Rex4jStep.class); private final DPLParserCatalystContext catCtx; + public Rex4jStep(DPLParserCatalystContext catCtx) { super(); this.catCtx = catCtx; @@ -83,15 +84,21 @@ public Dataset get(Dataset dataset) { // FIXME Implement character substitute mode and Nth occurrence flag // y/ and /N, where N>0 if (sed.length < 4) { - throw new RuntimeException("Invalid sedMode string given in rex4j: " + regexStr + "\nExpected: s/regexp/replacement/g"); + throw new RuntimeException( + "Invalid sedMode string given in rex4j: " + regexStr + "\nExpected: s/regexp/replacement/g" + ); } if (!sed[0].equals("s")) { - throw new UnsupportedOperationException("Only replace strings mode (s/) is supported as of now. Expected: s, Actual: " + sed[0]); + throw new UnsupportedOperationException( + "Only replace strings mode (s/) is supported as of now. Expected: s, Actual: " + sed[0] + ); } if (!sed[3].equals("g")) { - throw new UnsupportedOperationException("Only global flag (/g) is supported as of now. Expected: g, Actual: " + sed[3]); + throw new UnsupportedOperationException( + "Only global flag (/g) is supported as of now. Expected: g, Actual: " + sed[3] + ); } Column rex = functions.regexp_replace(new Column(field), sed[1], sed[2]); @@ -102,7 +109,9 @@ public Dataset get(Dataset dataset) { // a namedGroup must exist if (fields.isEmpty()) { - throw new IllegalArgumentException("Error in rex4j command, regexp-string missing mandatory match groups."); + throw new IllegalArgumentException( + "Error in rex4j command, regexp-string missing mandatory match groups." 
+ ); } // go through multi extraction groups @@ -112,8 +121,8 @@ public Dataset get(Dataset dataset) { Integer in = me.getValue(); // perform regexp_extract rex = functions.regexp_extract(functions.col(field), regexStr, in); - res = res.withColumn(me.getKey(), functions.when(rex.eqNullSafe(functions.lit("")), - functions.lit(catCtx.nullValue.value())).otherwise(rex)); + res = res + .withColumn(me.getKey(), functions.when(rex.eqNullSafe(functions.lit("")), functions.lit(catCtx.nullValue.value())).otherwise(rex)); } return res; } diff --git a/src/main/java/com/teragrep/pth10/steps/search/AbstractSearchStep.java b/src/main/java/com/teragrep/pth10/steps/search/AbstractSearchStep.java index c318388..b5d8650 100644 --- a/src/main/java/com/teragrep/pth10/steps/search/AbstractSearchStep.java +++ b/src/main/java/com/teragrep/pth10/steps/search/AbstractSearchStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -49,6 +49,7 @@ import org.apache.spark.sql.Column; public abstract class AbstractSearchStep extends AbstractStep { + protected Column filteringColumn; public Column getFilteringColumn() { diff --git a/src/main/java/com/teragrep/pth10/steps/search/SearchStep.java b/src/main/java/com/teragrep/pth10/steps/search/SearchStep.java index 73f6593..151d92d 100644 --- a/src/main/java/com/teragrep/pth10/steps/search/SearchStep.java +++ b/src/main/java/com/teragrep/pth10/steps/search/SearchStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -51,7 +51,9 @@ import org.slf4j.LoggerFactory; public class SearchStep extends AbstractSearchStep { + private static final Logger LOGGER = LoggerFactory.getLogger(SearchStep.class); + @Override public Dataset get(Dataset dataset) { LOGGER.info("Filtering search with column: <{}>", this.filteringColumn.toString()); diff --git a/src/main/java/com/teragrep/pth10/steps/sendemail/AbstractSendemailStep.java b/src/main/java/com/teragrep/pth10/steps/sendemail/AbstractSendemailStep.java index b3ce228..ed420df 100644 --- a/src/main/java/com/teragrep/pth10/steps/sendemail/AbstractSendemailStep.java +++ b/src/main/java/com/teragrep/pth10/steps/sendemail/AbstractSendemailStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,13 +43,13 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.sendemail; import com.teragrep.pth10.ast.commands.transformstatement.sendemail.SendemailResultsProcessor; import com.teragrep.pth10.steps.AbstractStep; public abstract class AbstractSendemailStep extends AbstractStep { + protected SendemailResultsProcessor sendemailResultsProcessor = null; protected boolean sendResults = false; diff --git a/src/main/java/com/teragrep/pth10/steps/sendemail/SendemailStep.java b/src/main/java/com/teragrep/pth10/steps/sendemail/SendemailStep.java index af84d19..c63a1cd 100644 --- a/src/main/java/com/teragrep/pth10/steps/sendemail/SendemailStep.java +++ b/src/main/java/com/teragrep/pth10/steps/sendemail/SendemailStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,15 +43,14 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.sendemail; -import com.teragrep.pth10.ast.commands.transformstatement.sendemail.SendemailResultsProcessor; import com.teragrep.pth10.steps.Flushable; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; public final class SendemailStep extends AbstractSendemailStep implements Flushable { + public SendemailStep() { super(); this.properties.add(CommandProperty.SEQUENTIAL_ONLY); @@ -61,7 +60,8 @@ public SendemailStep() { public void flush() { try { this.sendemailResultsProcessor.flush(); - } catch (Exception e) { + } + catch (Exception e) { throw new RuntimeException("Error flushing sendemail: " + e); } } @@ -80,10 +80,12 @@ else if (dataset.isEmpty()) { // only collect if results need to be sent if (this.sendResults) { this.sendemailResultsProcessor.call(dataset.collectAsList()); - } else if (!this.sendemailResultsProcessor.getIsCalledBefore()) { + } + else if (!this.sendemailResultsProcessor.getIsCalledBefore()) { this.sendemailResultsProcessor.call(); } - } catch (Exception e) { + } + catch (Exception e) { throw new RuntimeException(e); } diff --git a/src/main/java/com/teragrep/pth10/steps/sort/AbstractSortStep.java b/src/main/java/com/teragrep/pth10/steps/sort/AbstractSortStep.java index c75f9e5..1c37167 100644 --- a/src/main/java/com/teragrep/pth10/steps/sort/AbstractSortStep.java +++ b/src/main/java/com/teragrep/pth10/steps/sort/AbstractSortStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,19 +43,17 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.sort; import com.teragrep.functions.dpf_02.BatchCollect; import com.teragrep.functions.dpf_02.SortByClause; import com.teragrep.pth10.ast.DPLParserCatalystContext; import com.teragrep.pth10.steps.AbstractStep; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; import java.util.List; public abstract class AbstractSortStep extends AbstractStep { + protected int limit; protected boolean desc = false; protected List listOfSortByClauses = null; diff --git a/src/main/java/com/teragrep/pth10/steps/sort/AggregatedSort.java b/src/main/java/com/teragrep/pth10/steps/sort/AggregatedSort.java index 3348c61..1f13ae6 100644 --- a/src/main/java/com/teragrep/pth10/steps/sort/AggregatedSort.java +++ b/src/main/java/com/teragrep/pth10/steps/sort/AggregatedSort.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.sort; import com.teragrep.functions.dpf_02.SortByClause; diff --git a/src/main/java/com/teragrep/pth10/steps/sort/SortStep.java b/src/main/java/com/teragrep/pth10/steps/sort/SortStep.java index 41e5e72..1687616 100644 --- a/src/main/java/com/teragrep/pth10/steps/sort/SortStep.java +++ b/src/main/java/com/teragrep/pth10/steps/sort/SortStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.sort; import com.teragrep.functions.dpf_02.BatchCollect; @@ -57,7 +56,9 @@ import java.util.List; public final class SortStep extends AbstractSortStep { + private static final Logger LOGGER = LoggerFactory.getLogger(SortStep.class); + public SortStep(DPLParserCatalystContext catCtx, List listOfSortByClauses, int limit, boolean desc) { super(); this.properties.add(CommandProperty.SEQUENTIAL_ONLY); @@ -70,7 +71,6 @@ public SortStep(DPLParserCatalystContext catCtx, List listOfSortBy this.sortingBatchCollect = new BatchCollect(null, limit, listOfSortByClauses); } - @Override public Dataset get(Dataset dataset) { if (dataset == null) { @@ -80,7 +80,8 @@ public Dataset get(Dataset dataset) { if (this.aggregatesUsedBefore) { LOGGER.info("Aggregates used: using regular sorting"); return aggregatedSort(dataset); - } else { + } + else { LOGGER.info("Aggregates not used: using BatchCollect to sort"); return sort(dataset); } @@ -89,6 +90,7 @@ public Dataset get(Dataset dataset) { /** * Performs a sort on a unsorted dataframe, using a RowComparator.
* Collects all rows of the current batch to the driver. + * * @param unsortedDs dataframe to sort * @return sorted dataframe */ diff --git a/src/main/java/com/teragrep/pth10/steps/spath/AbstractSpathStep.java b/src/main/java/com/teragrep/pth10/steps/spath/AbstractSpathStep.java index 0914a16..453c31a 100644 --- a/src/main/java/com/teragrep/pth10/steps/spath/AbstractSpathStep.java +++ b/src/main/java/com/teragrep/pth10/steps/spath/AbstractSpathStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,13 +43,13 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.spath; import com.teragrep.pth10.ast.DPLParserCatalystContext; import com.teragrep.pth10.steps.AbstractStep; public abstract class AbstractSpathStep extends AbstractStep { + protected String inputColumn = null; protected String outputColumn = null; protected String path = null; diff --git a/src/main/java/com/teragrep/pth10/steps/spath/SpathStep.java b/src/main/java/com/teragrep/pth10/steps/spath/SpathStep.java index 5caf260..b5033f0 100644 --- a/src/main/java/com/teragrep/pth10/steps/spath/SpathStep.java +++ b/src/main/java/com/teragrep/pth10/steps/spath/SpathStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
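The Javadoc above summarizes how SortStep behaves when no aggregates precede it: the current batch is pulled to the driver and ordered with a row comparator. A rough standalone illustration of that idea (this is not the dpf_02 BatchCollect implementation, and the field access assumes a string-typed sort column):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;

    public final class DriverSideSortSketch {

        public static Dataset<Row> sortBatch(SparkSession spark, Dataset<Row> unsortedDs, String field, boolean desc) {
            // collect the current batch to the driver
            List<Row> rows = new ArrayList<>(unsortedDs.collectAsList());
            // order the rows with a comparator on the requested field
            Comparator<Row> cmp = Comparator.comparing((Row r) -> r.<String>getAs(field));
            rows.sort(desc ? cmp.reversed() : cmp);
            // rebuild a dataset with the original schema but the new row order
            return spark.createDataFrame(rows, unsortedDs.schema());
        }
    }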
*/ - package com.teragrep.pth10.steps.spath; import com.teragrep.pth10.ast.MapTypeColumn; @@ -58,6 +57,7 @@ import java.util.*; public final class SpathStep extends AbstractSpathStep { + public SpathStep() { super(); } @@ -75,20 +75,20 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // register udf with SparkSession and get column ss.udf().register("UDF_Spath", new Spath(catCtx.nullValue), returnType); - Column spathExpr = functions.callUDF( - "UDF_Spath", // name of UDF - functions.col(inputColumn), // Input column (actual data to run spath on) - functions.lit(path), // Path to extract data from, usually mainkey.someotherkey - functions.lit(inputColumn), // Name of input column (no data) - functions.lit(outputColumn) // Name of output column (no data) - ); + Column spathExpr = functions + .callUDF( + "UDF_Spath", // name of UDF + functions.col(inputColumn), // Input column (actual data to run spath on) + functions.lit(path), // Path to extract data from, usually mainkey.someotherkey + functions.lit(inputColumn), // Name of input column (no data) + functions.lit(outputColumn) // Name of output column (no data) + ); // Not in auto-extraction mode: can just return the first and only value from the map if (!autoExtractionMode) { return dataset.withColumn(new UnquotedText(new TextString(outputColumn)).read(), spathExpr.getItem(path)); } - // // auto-extraction mode // @@ -107,15 +107,18 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // Check for nulls; return an empty string if null, otherwise value for given key for (String key : keys) { - withAppliedUdfDs = withAppliedUdfDs.withColumn( - key, - functions.when( - /* if key.value == null */ - functions.isnull(withAppliedUdfDs.col(outputColumn).getItem(key)), - /* then return empty string */ - functions.lit("")) - /* otherwise return key.value */ - .otherwise(withAppliedUdfDs.col(outputColumn).getItem(key))); + withAppliedUdfDs = withAppliedUdfDs + .withColumn( + key, functions + .when( + /* if key.value == null */ + functions.isnull(withAppliedUdfDs.col(outputColumn).getItem(key)), + /* then return empty string */ + functions.lit("") + ) + /* otherwise return key.value */ + .otherwise(withAppliedUdfDs.col(outputColumn).getItem(key)) + ); } // Output column can be dropped diff --git a/src/main/java/com/teragrep/pth10/steps/stats/AbstractStatsStep.java b/src/main/java/com/teragrep/pth10/steps/stats/AbstractStatsStep.java index 029272f..fcd58cd 100644 --- a/src/main/java/com/teragrep/pth10/steps/stats/AbstractStatsStep.java +++ b/src/main/java/com/teragrep/pth10/steps/stats/AbstractStatsStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
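The SpathStep hunk wraps a plain Spark pattern: register a Java UDF on the SparkSession, invoke it through functions.callUDF, and coalesce null results to empty strings with when/isnull/otherwise. The same pattern in isolation, using a made-up UDF and column names:

    import org.apache.spark.sql.Column;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.api.java.UDF1;
    import org.apache.spark.sql.functions;
    import org.apache.spark.sql.types.DataTypes;

    public final class UdfPatternSketch {

        public static Dataset<Row> apply(SparkSession ss, Dataset<Row> ds) {
            // register the UDF under a name, with an explicit return type
            ss.udf().register("UDF_Upper", (UDF1<String, String>) s -> s == null ? null : s.toUpperCase(), DataTypes.StringType);

            // call it on a column
            Column upper = functions.callUDF("UDF_Upper", functions.col("_raw"));

            // null-safe variant: empty string when the UDF returns null
            Column nullSafe = functions.when(functions.isnull(upper), functions.lit("")).otherwise(upper);
            return ds.withColumn("raw_upper", nullSafe);
        }
    }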
*/ - package com.teragrep.pth10.steps.stats; import com.teragrep.pth10.steps.AbstractStep; @@ -70,4 +69,3 @@ public List getListOfAggregationExpressions() { return listOfAggregationExpressions; } } - diff --git a/src/main/java/com/teragrep/pth10/steps/stats/StatsStep.java b/src/main/java/com/teragrep/pth10/steps/stats/StatsStep.java index 1545056..dc616b2 100644 --- a/src/main/java/com/teragrep/pth10/steps/stats/StatsStep.java +++ b/src/main/java/com/teragrep/pth10/steps/stats/StatsStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.stats; import org.apache.spark.sql.Column; @@ -54,7 +53,8 @@ import java.util.List; -public final class StatsStep extends AbstractStatsStep{ +public final class StatsStep extends AbstractStatsStep { + public StatsStep(List listOfAggregationExpressions, List listOfGroupBys) { super(listOfAggregationExpressions, listOfGroupBys); this.properties.add(CommandProperty.AGGREGATE); @@ -72,14 +72,15 @@ public Dataset get(Dataset dataset) { } Column mainExpr = this.listOfAggregationExpressions.get(0); - Seq seqOfAggs = JavaConversions.asScalaBuffer( - this.listOfAggregationExpressions.subList(1, this.listOfAggregationExpressions.size())); + Seq seqOfAggs = JavaConversions + .asScalaBuffer(this.listOfAggregationExpressions.subList(1, this.listOfAggregationExpressions.size())); // Check for any group by expressions if (!this.listOfGroupBys.isEmpty()) { Seq seqOfGroupBys = JavaConversions.asScalaBuffer(this.listOfGroupBys); return dataset.groupBy(seqOfGroupBys).agg(mainExpr, seqOfAggs); - } else { + } + else { // No group by, just perform a direct aggregation return dataset.agg(mainExpr, seqOfAggs); } diff --git a/src/main/java/com/teragrep/pth10/steps/strcat/AbstractStrcatStep.java b/src/main/java/com/teragrep/pth10/steps/strcat/AbstractStrcatStep.java index 52bfd33..c967384 100644 --- a/src/main/java/com/teragrep/pth10/steps/strcat/AbstractStrcatStep.java +++ b/src/main/java/com/teragrep/pth10/steps/strcat/AbstractStrcatStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
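StatsStep uses the usual Java-to-Scala bridge for Spark's varargs methods: the first aggregation expression goes in as the lead argument and the rest as a Seq, and the same conversion is applied to any group-by columns. Shown standalone (the wrapper class is illustrative):

    import org.apache.spark.sql.Column;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import scala.collection.JavaConversions;
    import scala.collection.Seq;

    import java.util.List;

    public final class MultiAggSketch {

        public static Dataset<Row> aggregateAll(Dataset<Row> ds, List<Column> aggs, List<Column> groupBys) {
            Column mainExpr = aggs.get(0);
            // Spark's agg(Column, Column...) is visible from Java as agg(Column, Seq<Column>)
            Seq<Column> rest = JavaConversions.asScalaBuffer(aggs.subList(1, aggs.size()));
            if (!groupBys.isEmpty()) {
                Seq<Column> keys = JavaConversions.asScalaBuffer(groupBys);
                return ds.groupBy(keys).agg(mainExpr, rest);
            }
            // no grouping: one global aggregation
            return ds.agg(mainExpr, rest);
        }
    }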
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.strcat; import com.teragrep.pth10.steps.AbstractStep; @@ -52,6 +51,7 @@ import java.util.List; public abstract class AbstractStrcatStep extends AbstractStep { + public List getListOfFields() { return listOfFields; } diff --git a/src/main/java/com/teragrep/pth10/steps/strcat/StrcatStep.java b/src/main/java/com/teragrep/pth10/steps/strcat/StrcatStep.java index 17c1a01..117e4be 100644 --- a/src/main/java/com/teragrep/pth10/steps/strcat/StrcatStep.java +++ b/src/main/java/com/teragrep/pth10/steps/strcat/StrcatStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.strcat; import com.teragrep.pth10.ast.NullValue; @@ -62,7 +61,9 @@ import java.util.stream.Collectors; public final class StrcatStep extends AbstractStrcatStep { + private final NullValue nullValue; + public StrcatStep(NullValue nullValue) { super(); this.nullValue = nullValue; @@ -108,14 +109,16 @@ public Dataset get(Dataset dataset) { /** * Removes non-existing fields from the list of field names + * * @param fields list of fields * @return list of fields without non-existing fields */ private List removeNonExistingColumns(List fields, Dataset dataset) { List fieldsRemoved = new ArrayList<>(); fields.forEach(field -> { - if (Arrays.toString(dataset.columns()).contains(field) || - (field.startsWith("\"") && field.endsWith("\""))) { + if ( + Arrays.toString(dataset.columns()).contains(field) || (field.startsWith("\"") && field.endsWith("\"")) + ) { fieldsRemoved.add(field); } }); diff --git a/src/main/java/com/teragrep/pth10/steps/subsearch/AbstractSubsearchStep.java b/src/main/java/com/teragrep/pth10/steps/subsearch/AbstractSubsearchStep.java index 90f135b..335140d 100644 --- a/src/main/java/com/teragrep/pth10/steps/subsearch/AbstractSubsearchStep.java +++ b/src/main/java/com/teragrep/pth10/steps/subsearch/AbstractSubsearchStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
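StrcatStep's removeNonExistingColumns keeps a field either when it appears among the dataset's columns or when it is a quoted literal. A simplified sketch of that filter using exact column matching (the original uses a substring check against Arrays.toString(dataset.columns()), which is looser):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    import java.util.Arrays;
    import java.util.List;
    import java.util.stream.Collectors;

    public final class ExistingFieldsSketch {

        public static List<String> keep(List<String> fields, Dataset<Row> ds) {
            List<String> existing = Arrays.asList(ds.columns());
            return fields
                    .stream()
                    // keep real columns and quoted string literals, drop everything else
                    .filter(f -> existing.contains(f) || (f.startsWith("\"") && f.endsWith("\"")))
                    .collect(Collectors.toList());
        }
    }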
* * * Additional permission under GNU Affero General Public License version 3 @@ -50,10 +50,12 @@ import com.teragrep.pth10.steps.AbstractStep; public abstract class AbstractSubsearchStep extends AbstractStep { + public enum SubSearchType { MAIN_SEARCH_FILTERING, // for use in "LogicalStatement" subsearch (e.g. index=a [ search index=b ]) JOIN_COMMAND_SUBSEARCH // for use in join command's [ ] brackets (e.g. | join (...) [ search index=b ]) } + protected StepList stepList; protected DPLInternalStreamingQueryListener listener; protected String hdfsPath; diff --git a/src/main/java/com/teragrep/pth10/steps/subsearch/SubsearchStep.java b/src/main/java/com/teragrep/pth10/steps/subsearch/SubsearchStep.java index ae264e9..c7b025f 100644 --- a/src/main/java/com/teragrep/pth10/steps/subsearch/SubsearchStep.java +++ b/src/main/java/com/teragrep/pth10/steps/subsearch/SubsearchStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.subsearch; import com.teragrep.pth10.ast.StepList; @@ -62,11 +61,12 @@ import java.util.regex.Pattern; /** - * Step for filtering the dataset generated in LogicalXMLStep. - * This Step needs its own StepList. It has to filter the same dataset twice: first with the stepList given to it, and - * then it reads the results from that dataset and filters the dataset with the results. + * Step for filtering the dataset generated in LogicalXMLStep. This Step needs its own StepList. It has to filter the + * same dataset twice: first with the stepList given to it, and then it reads the results from that dataset and filters + * the dataset with the results. 
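The Javadoc above describes the two-pass filtering: the subsearch first produces its own result set, and those result values are then turned into a filter over the main dataset's _raw column. The filter-building part, as the step below does it, amounts to OR-chaining case-insensitive regex matches (sketch with illustrative class and method names):

    import org.apache.spark.sql.Column;
    import org.apache.spark.sql.functions;

    import java.util.List;
    import java.util.regex.Pattern;

    public final class SubsearchFilterSketch {

        public static Column buildFilter(List<String> subsearchValues) {
            Column filter = null;
            for (String value : subsearchValues) {
                // quote the collected value so it is matched literally, case-insensitively
                Column match = functions.col("_raw").rlike("(?i)^.*" + Pattern.quote(value) + ".*$");
                filter = (filter == null) ? match : filter.or(match);
            }
            if (filter == null) {
                throw new IllegalStateException("Subsearch produced no values to filter with");
            }
            return filter;
        }
    }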
*/ public final class SubsearchStep extends AbstractSubsearchStep { + private static final Logger LOGGER = LoggerFactory.getLogger(SubsearchStep.class); public SubsearchStep(StepList stepList) { @@ -91,7 +91,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { final Map mapOfColumnNames = new HashMap<>(); final StructField[] subsearchFields = subSearchDs.schema().fields(); for (final StructField field : subsearchFields) { - final String encodedName = "HEX".concat(Hex.encodeHexString(field.name().getBytes(StandardCharsets.UTF_8))); + final String encodedName = "HEX" + .concat(Hex.encodeHexString(field.name().getBytes(StandardCharsets.UTF_8))); subSearchDs = subSearchDs.withColumnRenamed(field.name(), encodedName); mapOfColumnNames.put(encodedName, field.name()); } @@ -101,16 +102,15 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { final String hdfsPath = this.hdfsPath; final String cpPath = hdfsPath + "checkpoint/sub/" + randomID; final String path = hdfsPath + "data/sub/" + randomID; - DataStreamWriter subToDiskWriter = - subSearchDs - .repartition(1) - .writeStream() - .format("avro") - .trigger(Trigger.ProcessingTime(0)) - // .option("spark.cleaner.referenceTracking.cleanCheckpoints", "true") - .option("checkpointLocation", cpPath) - .option("path", path) - .outputMode(OutputMode.Append()); + DataStreamWriter subToDiskWriter = subSearchDs + .repartition(1) + .writeStream() + .format("avro") + .trigger(Trigger.ProcessingTime(0)) + // .option("spark.cleaner.referenceTracking.cleanCheckpoints", "true") + .option("checkpointLocation", cpPath) + .option("path", path) + .outputMode(OutputMode.Append()); SparkSession ss = SparkSession.builder().getOrCreate(); @@ -119,7 +119,6 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // await for listener to stop the subToDiskQuery subToDiskQuery.awaitTermination(); - // read subsearch data from disk and collect Dataset readFromDisk = ss.read().schema(subSearchDs.schema()).format("avro").load(path); @@ -137,7 +136,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { for (int i = 0; i < collectedRow.length(); i++) { String rowContent = collectedRow.get(i).toString(); if (filterColumn != null) { - filterColumn = filterColumn.or(functions.col("_raw").rlike("(?i)^.*" + Pattern.quote(rowContent) + ".*$")); + filterColumn = filterColumn + .or(functions.col("_raw").rlike("(?i)^.*" + Pattern.quote(rowContent) + ".*$")); } else { filterColumn = functions.col("_raw").rlike("(?i)^.*" + Pattern.quote(rowContent) + ".*$"); @@ -145,7 +145,6 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { } } - if (filterColumn == null) { throw new IllegalStateException("Generated filter column via subsearch was null!"); } diff --git a/src/main/java/com/teragrep/pth10/steps/table/AbstractTableStep.java b/src/main/java/com/teragrep/pth10/steps/table/AbstractTableStep.java index 7e8b29b..730c8f7 100644 --- a/src/main/java/com/teragrep/pth10/steps/table/AbstractTableStep.java +++ b/src/main/java/com/teragrep/pth10/steps/table/AbstractTableStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero 
General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.table; import com.teragrep.pth10.steps.AbstractStep; @@ -52,6 +51,7 @@ import java.util.List; public abstract class AbstractTableStep extends AbstractStep { + protected List listOfFields = new ArrayList<>(); public AbstractTableStep() { diff --git a/src/main/java/com/teragrep/pth10/steps/table/TableStep.java b/src/main/java/com/teragrep/pth10/steps/table/TableStep.java index c4cb075..2412acd 100644 --- a/src/main/java/com/teragrep/pth10/steps/table/TableStep.java +++ b/src/main/java/com/teragrep/pth10/steps/table/TableStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.table; import org.apache.spark.sql.Column; @@ -62,6 +61,7 @@ import java.util.stream.Collectors; public final class TableStep extends AbstractTableStep { + private static final Logger LOGGER = LoggerFactory.getLogger(TableStep.class); public TableStep() { @@ -91,9 +91,8 @@ public Dataset get(Dataset dataset) { } // reorder them to be in the same order as in the table command - Seq seqOfCols = JavaConversions.asScalaBuffer( - wildcardedFields.stream().map(functions::col).collect(Collectors.toList()) - ); + Seq seqOfCols = JavaConversions + .asScalaBuffer(wildcardedFields.stream().map(functions::col).collect(Collectors.toList())); assert dsWithDroppedCols != null : "Dropped columns dataset was null"; @@ -108,7 +107,8 @@ public Dataset get(Dataset dataset) { /** * Gets wildcarded fields from given array of column names - * @param wc wildcard statement + * + * @param wc wildcard statement * @param cols array of column names * @return list of column names which match the wildcard statement */ @@ -126,16 +126,18 @@ private List getWildcardFields(String wc, String[] cols) { quotablePartBuilder.setLength(0); } regexBuilder.append(".*"); - } else { + } + else { // On normal characters, add to quotablePartBuilder - quotablePartBuilder.append(c); + quotablePartBuilder.append(c); } } if (quotablePartBuilder.length() > 0) { // if quotablePartBuilder is not empty, quote and add it regex = Pattern.quote(quotablePartBuilder.toString()); - } else { + } + else { // if it is empty, the regexBuilder contains the final regex regex = regexBuilder.toString(); } diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/DecompressibleInputStream.java b/src/main/java/com/teragrep/pth10/steps/teragrep/DecompressibleInputStream.java index 844c6cd..b7719d1 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/DecompressibleInputStream.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/DecompressibleInputStream.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
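TableStep's getWildcardFields translates a wildcard expression into a regex: literal runs are Pattern.quote()d, each '*' becomes ".*", and column names are then matched against the result. A simplified standalone variant (it does not reproduce the builder bookkeeping of the method above exactly):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Pattern;

    public final class WildcardFieldsSketch {

        public static List<String> match(String wildcard, String[] columns) {
            StringBuilder regex = new StringBuilder();
            StringBuilder literal = new StringBuilder();
            for (char c : wildcard.toCharArray()) {
                if (c == '*') {
                    if (literal.length() > 0) {
                        // quote the literal run so regex metacharacters stay literal
                        regex.append(Pattern.quote(literal.toString()));
                        literal.setLength(0);
                    }
                    regex.append(".*");
                }
                else {
                    literal.append(c);
                }
            }
            if (literal.length() > 0) {
                regex.append(Pattern.quote(literal.toString()));
            }
            List<String> matched = new ArrayList<>();
            for (String col : columns) {
                if (col.matches(regex.toString())) {
                    matched.add(col);
                }
            }
            return matched;
        }
    }

For example, match("_ti*", new String[] { "_time", "_raw" }) keeps only "_time".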
* * * Additional permission under GNU Affero General Public License version 3 @@ -51,6 +51,7 @@ import java.io.ObjectStreamClass; public class DecompressibleInputStream extends ObjectInputStream { + public DecompressibleInputStream(InputStream in) throws IOException { super(in); } diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepBloomStep.java b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepBloomStep.java index 4c375da..be8ad37 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepBloomStep.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepBloomStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.teragrep; import com.teragrep.functions.dpf_03.BloomFilterAggregator; @@ -68,12 +67,9 @@ * teragrep exec bloom */ public class TeragrepBloomStep extends AbstractStep { + public enum BloomMode { - UPDATE, - CREATE, - ESTIMATE, - AGGREGATE, - DEFAULT + UPDATE, CREATE, ESTIMATE, AGGREGATE, DEFAULT } private static final Logger LOGGER = LoggerFactory.getLogger(TeragrepBloomStep.class); @@ -91,8 +87,13 @@ public enum BloomMode { public final static String BLOOM_NUMBER_OF_FIELDS_CONFIG_ITEM = "dpl.pth_06.bloom.db.fields"; public final static Double MAX_FPP = 0.01; - public TeragrepBloomStep(Config zeppelinConfig, BloomMode mode, - String inputCol, String outputCol, String estimateCol) { + public TeragrepBloomStep( + Config zeppelinConfig, + BloomMode mode, + String inputCol, + String outputCol, + String estimateCol + ) { this.zeppelinConfig = zeppelinConfig; this.mode = mode; this.inputCol = inputCol; @@ -122,9 +123,10 @@ public Dataset get(Dataset dataset) { rv = aggregate(dataset); break; default: - throw new UnsupportedOperationException("Selected bloom command is not supported. " + - "Supported commands: exec bloom create, exec bloom update, exec bloom estimate," + - "."); + throw new UnsupportedOperationException( + "Selected bloom command is not supported. " + + "Supported commands: exec bloom create, exec bloom update, exec bloom estimate," + "." 
+ ); } return rv; @@ -132,6 +134,7 @@ public Dataset get(Dataset dataset) { /** * Create and store a bloom filter byte generated from Datasets rows _raw column (Ignores duplicates) + * * @param dataset Dataset that is used to update database * @return Dataset unmodified */ @@ -146,6 +149,7 @@ private Dataset createBloomFilter(Dataset dataset) { /** * Create and store a bloom filter byte arrays generated from Datasets rows _raw column (Replaces duplicates) + * * @param dataset Dataset that is used to update database * @return Dataset unmodified */ @@ -159,28 +163,19 @@ private Dataset updateBloomFilter(Dataset dataset) { } private Dataset estimateSize(Dataset dataset) { - return dataset.select( - functions.col("partition"), - functions.explode( - functions.col(inputCol) - ).as("token") - ) + return dataset + .select(functions.col("partition"), functions.explode(functions.col(inputCol)).as("token")) .groupBy("partition") - .agg( - functions.approxCountDistinct("token") - .as(outputCol) - ); + .agg(functions.approxCountDistinct("token").as(outputCol)); } public Dataset aggregate(Dataset dataset) { FilterSizes filterSizes = new FilterSizes(this.zeppelinConfig); - BloomFilterAggregator agg = - new BloomFilterAggregator(inputCol, estimateCol, filterSizes.asSortedMap()); + BloomFilterAggregator agg = new BloomFilterAggregator(inputCol, estimateCol, filterSizes.asSortedMap()); - return dataset.groupBy("partition") - .agg(agg.toColumn().as("bloomfilter")); + return dataset.groupBy("partition").agg(agg.toColumn().as("bloomfilter")); } @@ -190,9 +185,12 @@ private void writeFilterSizesToDatabase(Config config) { Connection connection = new LazyConnection(config).get(); SortedMap filterSizeMap = filterSizes.asSortedMap(); - for(Map.Entry entry : filterSizeMap.entrySet()) { - LOGGER.info("Writing filtertype[expected: <{}>, fpp: <{}>] to bloomdb.filtertype", - entry.getKey(), entry.getValue()); + for (Map.Entry entry : filterSizeMap.entrySet()) { + LOGGER + .info( + "Writing filtertype[expected: <{}>, fpp: <{}>] to bloomdb.filtertype", entry.getKey(), + entry.getValue() + ); String sql = "INSERT IGNORE INTO `filtertype` (`expectedElements`, `targetFpp`) VALUES (?, ?)"; @@ -205,8 +203,13 @@ private void writeFilterSizesToDatabase(Config config) { connection.commit(); - } catch (SQLException e) { - LOGGER.error("Error writing filter[expected: <{}>, fpp: <{}>] into database", entry.getKey(), entry.getValue()); + } + catch (SQLException e) { + LOGGER + .error( + "Error writing filter[expected: <{}>, fpp: <{}>] into database", entry.getKey(), + entry.getValue() + ); throw new RuntimeException(e); } } diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepDynatraceStep.java b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepDynatraceStep.java index 45740b7..b01cc21 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepDynatraceStep.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepDynatraceStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. 
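The estimate branch of TeragrepBloomStep reduces to a compact Spark idiom: explode the tokenized column and take an approximate distinct count per partition, which later drives the bloom filter sizing. The idiom in isolation, using the same column names as the hunk (the wrapper class is illustrative):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.functions;

    public final class TokenEstimateSketch {

        public static Dataset<Row> estimate(Dataset<Row> ds, String inputCol, String outputCol) {
            return ds
                    // one row per token, keyed by partition
                    .select(functions.col("partition"), functions.explode(functions.col(inputCol)).as("token"))
                    .groupBy("partition")
                    // approximate distinct token count per partition
                    .agg(functions.approxCountDistinct("token").as(outputCol));
        }
    }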
* * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -75,11 +75,13 @@ import java.util.List; public class TeragrepDynatraceStep extends AbstractStep implements Flushable { + private static final Logger LOGGER = LoggerFactory.getLogger(TeragrepDynatraceStep.class); private final String metricKey; private final String metricsApiUrl; private final DPLParserCatalystContext catCtx; private List dynatraceItems; + public TeragrepDynatraceStep(DPLParserCatalystContext catCtx, String metricKey, String metricsApiUrl) { super(); this.catCtx = catCtx; @@ -88,6 +90,7 @@ public TeragrepDynatraceStep(DPLParserCatalystContext catCtx, String metricKey, this.properties.add(CommandProperty.SEQUENTIAL_ONLY); this.properties.add(CommandProperty.REQUIRE_PRECEDING_AGGREGATE); } + @Override public Dataset get(Dataset dataset) { dynatraceItems = new ArrayList<>(); @@ -108,27 +111,32 @@ else if (name.startsWith("min(") && name.endsWith(")")) { // min(column) final String col = new NumericText(new TextString(row.get(j))).read(); dti.setMin(col); - } else if (name.startsWith("max(") && name.endsWith(")")) { + } + else if (name.startsWith("max(") && name.endsWith(")")) { // max(column) final String col = new NumericText(new TextString(row.get(j))).read(); dti.setMax(col); - } else if (name.startsWith("sum(") && name.endsWith(")")) { + } + else if (name.startsWith("sum(") && name.endsWith(")")) { // sum(column) final String col = new NumericText(new TextString(row.get(j))).read(); dti.setSum(col); - } else if (name.startsWith("count(") && name.endsWith(")")) { + } + else if (name.startsWith("count(") && name.endsWith(")")) { // count(column) final String col = new NumericText(new TextString(row.get(j))).read(); dti.setCount(col); - } else if (name.indexOf('(') > 0 && name.endsWith(")")) { + } + else if (name.indexOf('(') > 0 && name.endsWith(")")) { // (column) final String col = new NumericText(new TextString(row.get(j))).read(); final String underscorified = name.replaceAll("[(|)]", "_"); dti.addAggregate(underscorified, col); - } else { + } + else { // anything else should be a dimension final String col = row.get(j).toString(); dti.addDimension(name, col); @@ -150,13 +158,15 @@ private void sendPostReq(String urlStr, DynatraceItem dti) throws IOException { httpPost.setHeader("Content-Type", "text/plain; charset=utf-8"); httpPost.setEntity(new StringEntity(dti.toString(), "utf-8")); - try (CloseableHttpClient client = HttpClients.createDefault(); - CloseableHttpResponse response = client.execute(httpPost)) { + try ( + CloseableHttpClient client = HttpClients.createDefault(); + CloseableHttpResponse response = client.execute(httpPost) + ) { final int statusCode = response.getStatusLine().getStatusCode(); try (InputStream respStream = response.getEntity().getContent()) { - JsonObject jsonResp = new Gson().fromJson( - new InputStreamReader(respStream, StandardCharsets.UTF_8), JsonObject.class); + JsonObject jsonResp = new Gson() + .fromJson(new InputStreamReader(respStream, StandardCharsets.UTF_8), JsonObject.class); JsonElement errorElem = jsonResp.get("error"); if (!(errorElem instanceof JsonNull)) { throw new RuntimeException("Error from server response: " + errorElem.toString()); @@ -173,7 +183,6 @@ private void sendPostReq(String urlStr, DynatraceItem dti) throws IOException { LOGGER.warn("Invalid lines: <[{}]>", invalidElem); 
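The Dynatrace step's sendPostReq combines Apache HttpClient with Gson: POST the metric line as text/plain, parse the JSON body, and fail if the response carries a non-null error element or an unexpected status code. A self-contained sketch of that exchange (the endpoint and metric line passed in are hypothetical):

    import com.google.gson.Gson;
    import com.google.gson.JsonElement;
    import com.google.gson.JsonNull;
    import com.google.gson.JsonObject;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.entity.StringEntity;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;

    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.nio.charset.StandardCharsets;

    public final class MetricPostSketch {

        public static void post(String url, String metricLine) throws IOException {
            HttpPost httpPost = new HttpPost(url);
            httpPost.setHeader("Content-Type", "text/plain; charset=utf-8");
            httpPost.setEntity(new StringEntity(metricLine, "utf-8"));

            try (
                    CloseableHttpClient client = HttpClients.createDefault();
                    CloseableHttpResponse response = client.execute(httpPost)
            ) {
                int statusCode = response.getStatusLine().getStatusCode();
                try (InputStream body = response.getEntity().getContent()) {
                    JsonObject json = new Gson()
                            .fromJson(new InputStreamReader(body, StandardCharsets.UTF_8), JsonObject.class);
                    JsonElement error = json.get("error");
                    if (error != null && !(error instanceof JsonNull)) {
                        throw new RuntimeException("Error from server response: " + error);
                    }
                }
                // as in the step above, 202 and 400 are the expected response codes
                if (statusCode != 202 && statusCode != 400) {
                    throw new RuntimeException("Unexpected response code: " + statusCode);
                }
            }
        }
    }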
} - if (statusCode != 202 && statusCode != 400) { throw new RuntimeException("Error! Response code: <[" + statusCode + "]>. Expected 202 or 400."); } @@ -188,7 +197,8 @@ public void flush() { dynatraceItems.forEach(dti -> { try { sendPostReq(metricsApiUrl, dti); - } catch (IOException e) { + } + catch (IOException e) { throw new RuntimeException("Error sending post request: " + e); } }); diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsDeleteStep.java b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsDeleteStep.java index e53c41d..7c11026 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsDeleteStep.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsDeleteStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.teragrep; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -84,7 +83,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { String reason = "Unknown failure"; Dataset generated; try { - org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.get(catCtx.getSparkSession().sparkContext().hadoopConfiguration()); + org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem + .get(catCtx.getSparkSession().sparkContext().hadoopConfiguration()); org.apache.hadoop.fs.Path path = new org.apache.hadoop.fs.Path(pathStr); if (fs.exists(path)) { @@ -103,15 +103,12 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { Row r = RowFactory.create(pathStr, "delete", String.valueOf(success), reason); - final StructType schema = - new StructType( - new StructField[] { - new StructField("path", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("operation", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("success", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("reason", DataTypes.StringType, true, new MetadataBuilder().build()) - } - ); + final StructType schema = new StructType(new StructField[] { + new StructField("path", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("operation", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("success", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("reason", DataTypes.StringType, true, new MetadataBuilder().build()) + }); // make a streaming dataset SparkSession ss = catCtx.getSparkSession(); @@ -122,10 +119,11 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { generated = rowMemoryStream.toDS(); // create hdfs writer and query - final String queryName = "delete_hdfs_file" + ((int)(Math.random() * 100000)); - 
DataStreamWriter deleteHdfsWriter = generated. - writeStream().outputMode("append").format("memory"); - StreamingQuery deleteHdfsQuery = catCtx.getInternalStreamingQueryListener().registerQuery(queryName, deleteHdfsWriter); + final String queryName = "delete_hdfs_file" + ((int) (Math.random() * 100000)); + DataStreamWriter deleteHdfsWriter = generated.writeStream().outputMode("append").format("memory"); + StreamingQuery deleteHdfsQuery = catCtx + .getInternalStreamingQueryListener() + .registerQuery(queryName, deleteHdfsWriter); // add all the generated data to the memory stream rowMemoryStream.addData(JavaConversions.asScalaBuffer(Collections.singletonList(r))); @@ -133,7 +131,6 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // wait for it to be done and then return it deleteHdfsQuery.awaitTermination(); - return generated; } } diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsListStep.java b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsListStep.java index c21246d..b968050 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsListStep.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsListStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
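TeragrepHdfsDeleteStep reports its outcome as a one-row dataset with an explicit schema, fed through a MemoryStream so it participates in the streaming query. The schema-and-row part in a plain, non-streaming form (helper names are illustrative):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.RowFactory;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.MetadataBuilder;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;

    import java.util.Collections;

    public final class DeleteResultSketch {

        public static Dataset<Row> result(SparkSession ss, String path, boolean success, String reason) {
            // same four StringType columns as the step above
            StructType schema = new StructType(new StructField[] {
                    new StructField("path", DataTypes.StringType, true, new MetadataBuilder().build()),
                    new StructField("operation", DataTypes.StringType, true, new MetadataBuilder().build()),
                    new StructField("success", DataTypes.StringType, true, new MetadataBuilder().build()),
                    new StructField("reason", DataTypes.StringType, true, new MetadataBuilder().build())
            });
            Row row = RowFactory.create(path, "delete", String.valueOf(success), reason);
            return ss.createDataFrame(Collections.singletonList(row), schema);
        }
    }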
*/ - package com.teragrep.pth10.steps.teragrep; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -97,8 +96,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // no path specified, get user's home directory pathStr = "/user/" + catCtx.getSparkSession().sparkContext().sparkUser(); } - org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.get( - catCtx.getSparkSession().sparkContext().hadoopConfiguration()); + org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem + .get(catCtx.getSparkSession().sparkContext().hadoopConfiguration()); org.apache.hadoop.fs.Path path = new org.apache.hadoop.fs.Path(pathStr); FileStatus[] fileStatuses = fs.globStatus(path); @@ -127,29 +126,27 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { String size = twoDecimals.format((fileStatus.getLen() / 1024d)) + "K"; // create row containing the file info and add it to the listOfRows - Row r = RowFactory.create( - filePerms, fileOwner, size, fileModDate, fileAccDate, fileName, filePath, type); + Row r = RowFactory + .create(filePerms, fileOwner, size, fileModDate, fileAccDate, fileName, filePath, type); listOfRows.add(r); } - } else { + } + else { // no files found listOfRows.add(RowFactory.create(null, null, null, null, null, null, null, null)); } // schema for the created rows - final StructType schema = - new StructType( - new StructField[]{ - new StructField("permissions", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("owner", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("size", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("modificationDate", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("accessDate", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("name", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("path", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("type", DataTypes.StringType, true, new MetadataBuilder().build()) - } - ); + final StructType schema = new StructType(new StructField[] { + new StructField("permissions", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("owner", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("size", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("modificationDate", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("accessDate", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("name", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("path", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("type", DataTypes.StringType, true, new MetadataBuilder().build()) + }); // make a streaming dataset SparkSession ss = catCtx.getSparkSession(); @@ -161,9 +158,10 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // create hdfs writer and query final String queryName = "list_hdfs_files_" + ((int) (Math.random() * 100000)); - DataStreamWriter listHdfsWriter = generated. 
- writeStream().outputMode("append").format("memory"); - StreamingQuery listHdfsQuery = catCtx.getInternalStreamingQueryListener().registerQuery(queryName, listHdfsWriter); + DataStreamWriter listHdfsWriter = generated.writeStream().outputMode("append").format("memory"); + StreamingQuery listHdfsQuery = catCtx + .getInternalStreamingQueryListener() + .registerQuery(queryName, listHdfsWriter); // add all the generated data to the memory stream rowMemoryStream.addData(JavaConversions.asScalaBuffer(listOfRows)); @@ -171,9 +169,13 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // wait for it to be done and then return it listHdfsQuery.awaitTermination(); - } catch (FileNotFoundException fnfe) { - throw new RuntimeException("Specified path '" + pathStr + "' could not be found. Check that the path is written correctly."); - } catch (IOException e) { + } + catch (FileNotFoundException fnfe) { + throw new RuntimeException( + "Specified path '" + pathStr + "' could not be found. Check that the path is written correctly." + ); + } + catch (IOException e) { throw new RuntimeException(e); } // filter null-name rows out diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsLoadStep.java b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsLoadStep.java index 04bd176..d617438 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsLoadStep.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsLoadStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.teragrep; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -74,6 +73,7 @@ * Teragrep exec hdfs load: Load avro-formatted data from disk to memory */ public final class TeragrepHdfsLoadStep extends TeragrepHdfsStep { + private static final Logger LOGGER = LoggerFactory.getLogger(TeragrepHdfsLoadStep.class); private final DPLParserCatalystContext catCtx; public final String pathStr; @@ -85,7 +85,13 @@ public enum Format { CSV, JSON, AVRO } - public TeragrepHdfsLoadStep(DPLParserCatalystContext catCtx, String pathStr, Format format, boolean header, String schema) { + public TeragrepHdfsLoadStep( + DPLParserCatalystContext catCtx, + String pathStr, + Format format, + boolean header, + String schema + ) { this.catCtx = catCtx; this.pathStr = pathStr; this.format = format; @@ -98,8 +104,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { Dataset rv = null; try { // get hadoop fs - org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.get( - catCtx.getSparkSession().sparkContext().hadoopConfiguration()); + org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem + .get(catCtx.getSparkSession().sparkContext().hadoopConfiguration()); // first check if there is any wildcards if (pathStr.contains("*")) { // wildcard * char present @@ -110,7 +116,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { if (rv == null) { rv = processHdfsLoad(sPath, fs, false, schema); - } else { + } + else { Dataset res = processHdfsLoad(sPath, fs, false, schema); if (res != null) { rv = rv.union(res); @@ -118,20 +125,22 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { } } - } else { + } + else { // no wildcard char present LOGGER.info("HDFS Load did not find a wildcard char, loading as single path"); rv = processHdfsLoad(pathStr, fs, true, schema); } - - } catch (IOException ioe) { + } + catch (IOException ioe) { throw new RuntimeException(ioe); } if (rv == null) { - throw new RuntimeException("HDFS Load did not find any valid data in the given path, please double-check " + - "the path."); + throw new RuntimeException( + "HDFS Load did not find any valid data in the given path, please double-check " + "the path." + ); } return rv; @@ -139,11 +148,13 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { /** * Used to process each of the paths and return the loaded dataset + * * @param pathStr HDFS path to the folder containing the 'metadata.dpl' file and '/data' folder. - * @param fs Hadoop FS object + * @param fs Hadoop FS object * @return loaded data as Dataset */ - private Dataset processHdfsLoad(String pathStr, FileSystem fs, boolean isSinglePath, String csvSchema) throws StreamingQueryException { + private Dataset processHdfsLoad(String pathStr, FileSystem fs, boolean isSinglePath, String csvSchema) + throws StreamingQueryException { // read metadata first HdfsSaveMetadata metadata; StructType schema = new StructType(); @@ -166,13 +177,18 @@ private Dataset processHdfsLoad(String pathStr, FileSystem fs, boolean isSi wasStreaming = metadata.getWasStreamingDataset(); originalSchema = metadata.getOriginalSchema(); mapOfAvroNames = metadata.getMapOfAvroColumnNames(); - } else if (format == Format.AVRO && isSinglePath) { - throw new RuntimeException("Could not find metadata in the specified path. 
Double-check the given path."); - } else if (format == Format.AVRO) { + } + else if (format == Format.AVRO && isSinglePath) { + throw new RuntimeException( + "Could not find metadata in the specified path. Double-check the given path." + ); + } + else if (format == Format.AVRO) { throw new RuntimeException("Path '" + pathStr + "' did not contain the necessary metadata."); } - } catch (IOException e) { + } + catch (IOException e) { throw new RuntimeException(e); } @@ -187,15 +203,23 @@ private Dataset processHdfsLoad(String pathStr, FileSystem fs, boolean isSi if (wasStreaming) { // Standard streaming dataset, e.g. no aggregations or forEachBatch mode when saved return ss.readStream().format("avro").schema(schema).load(pathStr.concat("/data")); - } else { + } + else { // Non-streaming dataset, e.g. aggregations or forEachBatch mode when saved. // read json dataset - Dataset jsonDataset = ss.readStream().format("avro").schema(schema).load(pathStr.concat("/data")); + Dataset jsonDataset = ss + .readStream() + .format("avro") + .schema(schema) + .load(pathStr.concat("/data")); // explode into '$$dpl_internal_json_table$$' column - Dataset explodedJsonDs = jsonDataset.withColumn("$$dpl_internal_json_table$$", - functions.explode(functions.from_json(functions.col("value"), DataTypes.createArrayType(DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType))))); + Dataset explodedJsonDs = jsonDataset + .withColumn( + "$$dpl_internal_json_table$$", functions + .explode(functions.from_json(functions.col("value"), DataTypes.createArrayType(DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType)))) + ); // get original column names and prefix with '$$dpl_internal_json_table$$.' to access them final List jsonTableFields = new ArrayList<>(); @@ -204,11 +228,13 @@ private Dataset processHdfsLoad(String pathStr, FileSystem fs, boolean isSi } // select only prefixed columns - Seq selectCols = JavaConversions.asScalaBuffer(jsonTableFields.stream().map(functions::col).collect(Collectors.toList())); + Seq selectCols = JavaConversions + .asScalaBuffer(jsonTableFields.stream().map(functions::col).collect(Collectors.toList())); explodedJsonDs = explodedJsonDs.select(selectCols); return explodedJsonDs; } - } else { + } + else { // new avro-friendly save // load data from disk Dataset out = ss.readStream().format("avro").schema(schema).load(pathStr.concat("/data")); @@ -221,34 +247,42 @@ private Dataset processHdfsLoad(String pathStr, FileSystem fs, boolean isSi // get timechart span if '_time' column is present for (StructField field : out.schema().fields()) { if (field.name().equals("_time")) { - LOGGER.info("Found '_time' column in HDFS load data, reading min and max for timechart range calculation."); + LOGGER + .info( + "Found '_time' column in HDFS load data, reading min and max for timechart range calculation." 
+ ); AtomicLong earliest = new AtomicLong(Long.MAX_VALUE); AtomicLong latest = new AtomicLong(Long.MIN_VALUE); - DataStreamWriter dsw = out - .writeStream() - .foreachBatch((ds, i) -> { - if (!ds.isEmpty()) { - final long newEarliest = ds.agg(functions.min("_time")).first().getTimestamp(0).getTime() / 1000L; - if (earliest.get() > newEarliest) { - LOGGER.debug("Set default earliest: <{}>", newEarliest); - earliest.set(newEarliest); - } - - final long newLatest = ds.agg(functions.max("_time")).first().getTimestamp(0).getTime() / 1000L; - if (latest.get() < newLatest) { - LOGGER.debug("Set default latest: <{}>", newLatest); - latest.set(newLatest); - } - } else { - LOGGER.info("Avro file was empty, returning an empty dataset."); - } - }); - - StreamingQuery sq = catCtx.getInternalStreamingQueryListener().registerQuery(String.valueOf(UUID.randomUUID()), dsw); + DataStreamWriter dsw = out.writeStream().foreachBatch((ds, i) -> { + if (!ds.isEmpty()) { + final long newEarliest = ds + .agg(functions.min("_time")) + .first() + .getTimestamp(0) + .getTime() / 1000L; + if (earliest.get() > newEarliest) { + LOGGER.debug("Set default earliest: <{}>", newEarliest); + earliest.set(newEarliest); + } + + final long newLatest = ds.agg(functions.max("_time")).first().getTimestamp(0).getTime() + / 1000L; + if (latest.get() < newLatest) { + LOGGER.debug("Set default latest: <{}>", newLatest); + latest.set(newLatest); + } + } + else { + LOGGER.info("Avro file was empty, returning an empty dataset."); + } + }); + + StreamingQuery sq = catCtx + .getInternalStreamingQueryListener() + .registerQuery(String.valueOf(UUID.randomUUID()), dsw); sq.awaitTermination(); - catCtx.setDplMinimumEarliest(earliest.get()); catCtx.setDplMaximumLatest(latest.get()); break; @@ -257,50 +291,53 @@ private Dataset processHdfsLoad(String pathStr, FileSystem fs, boolean isSi return out; } - } else if (format == Format.CSV) { - // Standard csv format streaming dataset - String fileFormat = "csv"; - DataStreamReader reader = ss.readStream(); - if (header) { - reader = reader.option("header", "true"); - } else { - reader = reader.option("header", "false"); - } + } + else if (format == Format.CSV) { + // Standard csv format streaming dataset + String fileFormat = "csv"; + DataStreamReader reader = ss.readStream(); + if (header) { + reader = reader.option("header", "true"); + } + else { + reader = reader.option("header", "false"); + } - // prioritize schema given in command - if (csvSchema != null && !csvSchema.isEmpty()) { - reader = reader.schema(generateSchemaFromCsvHeader(csvSchema)); - } else if (schema != null && !schema.isEmpty()) { - // schema from metadata - reader = reader.schema(schema); - } else { - // no schema, load all in _raw column - // read as plain text to ignore delimiters - fileFormat = "text"; - reader = reader.schema(new StructType( - new StructField[] - { - new StructField("_raw", DataTypes.StringType, true, - new MetadataBuilder().build()) - })); - } + // prioritize schema given in command + if (csvSchema != null && !csvSchema.isEmpty()) { + reader = reader.schema(generateSchemaFromCsvHeader(csvSchema)); + } + else if (schema != null && !schema.isEmpty()) { + // schema from metadata + reader = reader.schema(schema); + } + else { + // no schema, load all in _raw column + // read as plain text to ignore delimiters + fileFormat = "text"; + reader = reader.schema(new StructType(new StructField[] { + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()) + })); + } - // files saved with HDFS 
save use a directory-based path - // 3rd party files may be single .csv files - if (pathStr.endsWith(".csv")) { - // append wildcard to make it a directory, structured streaming requires it - return reader.format(fileFormat).load(pathStr.concat("*")); - } else { - return reader.format(fileFormat).load(pathStr.concat("/data")); - } - } else { - throw new IllegalArgumentException("Format '" + format + "' is not supported."); + // files saved with HDFS save use a directory-based path + // 3rd party files may be single .csv files + if (pathStr.endsWith(".csv")) { + // append wildcard to make it a directory, structured streaming requires it + return reader.format(fileFormat).load(pathStr.concat("*")); + } + else { + return reader.format(fileFormat).load(pathStr.concat("/data")); + } + } + else { + throw new IllegalArgumentException("Format '" + format + "' is not supported."); } } /** - * Generate a Spark-compatible schema from a comma-separated header - * "a, b, c, d" + * Generate a Spark-compatible schema from a comma-separated header "a, b, c, d" + * * @param csvHeader CSV-style header schema * @return StructType containing the same schema. All as StringType. */ @@ -323,4 +360,4 @@ private StructType generateSchemaFromCsvHeader(final String csvHeader) { return new StructType(structFields); } -} \ No newline at end of file +} diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsSaveStep.java b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsSaveStep.java index 4ced239..1955f1e 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsSaveStep.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsSaveStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
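The TeragrepHdfsLoadStep hunk ends with generateSchemaFromCsvHeader, which turns a comma-separated header such as "a, b, c, d" into an all-StringType schema. A standalone sketch of that conversion:

    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.MetadataBuilder;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;

    public final class CsvHeaderSchemaSketch {

        public static StructType fromHeader(String csvHeader) {
            String[] names = csvHeader.split(",");
            StructField[] fields = new StructField[names.length];
            for (int i = 0; i < names.length; i++) {
                // trim whitespace around each name; every column is a nullable StringType
                fields[i] = new StructField(names[i].trim(), DataTypes.StringType, true, new MetadataBuilder().build());
            }
            return new StructType(fields);
        }
    }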
*/ - package com.teragrep.pth10.steps.teragrep; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -67,6 +66,7 @@ * teragrep exec hdfs save: Save dataset to disk in avro format */ public final class TeragrepHdfsSaveStep extends TeragrepHdfsStep { + private static final Logger LOGGER = LoggerFactory.getLogger(TeragrepHdfsSaveStep.class); private final DPLParserCatalystContext catCtx; @@ -75,11 +75,19 @@ public final class TeragrepHdfsSaveStep extends TeragrepHdfsStep { public final String retentionSpan; public final Format format; public final boolean header; + public enum Format { CSV, JSON, AVRO } - public TeragrepHdfsSaveStep(DPLParserCatalystContext catCtx, boolean overwrite, String pathStr, String retentionSpan, Format format, boolean header) { + public TeragrepHdfsSaveStep( + DPLParserCatalystContext catCtx, + boolean overwrite, + String pathStr, + String retentionSpan, + Format format, + boolean header + ) { super(); this.catCtx = catCtx; this.overwrite = overwrite; @@ -110,7 +118,10 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // If it doesn't exist, continue as-is. if (fs.exists(fsPath) && fs.isDirectory(fsPath)) { if (overwrite) { - LOGGER.info("TG HDFS Save: Pre-existing data was found in specified path. Deleting pre-existing data."); + LOGGER + .info( + "TG HDFS Save: Pre-existing data was found in specified path. Deleting pre-existing data." + ); // path=fsPath, recursive=true fs.delete(fsPath, true); @@ -133,13 +144,22 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { previousSaveWasStreaming = metadata.getWasStreamingDataset(); } - if (applicationId != null && applicationId.equals(catCtx.getSparkSession().sparkContext().applicationId()) - && paragraphId != null && paragraphId.equals(catCtx.getParagraphUrl()) && !previousSaveWasStreaming) { + if ( + applicationId != null && applicationId + .equals(catCtx.getSparkSession().sparkContext().applicationId()) && paragraphId != null + && paragraphId.equals(catCtx.getParagraphUrl()) && !previousSaveWasStreaming + ) { // appId matches last save and was not streaming (=aggregated); allow to overwrite // this is due to sequential mode visiting this multiple times -> metadata will exist after first batch and overwrite=false would block rest of the batches! - LOGGER.info("Previous HDFS save to this path was not streaming and appId matches last save; allowing overwrite and bypassing overwrite=false parameter."); - } else { - throw new RuntimeException("The specified path '" + pathStr + "' already exists, please select another path."); + LOGGER + .info( + "Previous HDFS save to this path was not streaming and appId matches last save; allowing overwrite and bypassing overwrite=false parameter." + ); + } + else { + throw new RuntimeException( + "The specified path '" + pathStr + "' already exists, please select another path." 
+ ); } } @@ -170,18 +190,19 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { final Map mapOfColumnNames = new HashMap<>(); for (final StructField field : dataset.schema().fields()) { // avro-friendly column names conversion - final String encodedName = "HEX".concat(Hex.encodeHexString(field.name().getBytes(StandardCharsets.UTF_8))); + final String encodedName = "HEX" + .concat(Hex.encodeHexString(field.name().getBytes(StandardCharsets.UTF_8))); convertedDataset = convertedDataset.withColumnRenamed(field.name(), encodedName); mapOfColumnNames.put(encodedName, field.name()); } metadata.setMapOfAvroColumnNames(mapOfColumnNames); metadata.setSchema(convertedDataset.schema()); - } else { + } + else { metadata.setSchema(dataset.schema()); } - // serialize and write to hdfs byte[] mdataArray = serializeMetadata(metadata); @@ -191,7 +212,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { out.write(mdataArray); out.close(); - } catch (IOException e) { + } + catch (IOException e) { throw new RuntimeException("Saving metadata object failed due to: \n" + e); } @@ -207,7 +229,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { .mode(this.aggregatesUsedBefore ? SaveMode.Overwrite : SaveMode.Append) .option("checkpointLocation", cpPath) .option("path", pathStr.concat("/data")); - } else if (format == Format.CSV) { + } + else if (format == Format.CSV) { hdfsSaveWriter = dataset .write() .format("csv") @@ -215,7 +238,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { .option("checkpointLocation", cpPath) .option("header", header) .option("path", pathStr.concat("/data")); - } else { + } + else { throw new IllegalArgumentException("Format '" + format + "' is not supported."); } @@ -237,7 +261,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { .option("path", pathStr.concat("/data")) .option("checkpointLocation", cpPath) .outputMode(OutputMode.Append()); - } else if (format == Format.CSV) { + } + else if (format == Format.CSV) { hdfsSaveWriter = dataset .repartition(1) .writeStream() @@ -247,12 +272,15 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { .option("checkpointLocation", cpPath) .option("header", header) .outputMode(OutputMode.Append()); - } else { + } + else { throw new IllegalArgumentException("Format '" + format + "' is not supported."); } // check for query completion - StreamingQuery hdfsSaveQuery = catCtx.getInternalStreamingQueryListener().registerQuery(queryName, hdfsSaveWriter); + StreamingQuery hdfsSaveQuery = catCtx + .getInternalStreamingQueryListener() + .registerQuery(queryName, hdfsSaveWriter); // await for listener to stop the hdfsSaveQuery hdfsSaveQuery.awaitTermination(); diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsStep.java b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsStep.java index 304466b..5e2ee5c 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsStep.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepHdfsStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public 
License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.teragrep; import com.teragrep.pth10.ast.commands.transformstatement.teragrep.HdfsSaveMetadata; @@ -59,18 +58,22 @@ public TeragrepHdfsStep() { /** * Serializes HdfsSaveMetadata to a byte array + * * @param metadata input metadata * @return serialized as byte array */ byte[] serializeMetadata(HdfsSaveMetadata metadata) { byte[] serialized; - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(baos)) { + try ( + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(baos) + ) { oos.writeObject(metadata); serialized = baos.toByteArray(); - } catch (IOException e) { + } + catch (IOException e) { throw new RuntimeException("Error serializing metadata object: " + e); } @@ -79,17 +82,20 @@ byte[] serializeMetadata(HdfsSaveMetadata metadata) { /** * Deserializes a byte array into a HdfsSaveMetadata object + * * @param serialized byte array * @return deserialized metadata object */ HdfsSaveMetadata deserializeMetadata(byte[] serialized) { HdfsSaveMetadata deserialized; - try (ByteArrayInputStream bais = new ByteArrayInputStream(serialized); - DecompressibleInputStream dis = new DecompressibleInputStream(bais)) { + try ( + ByteArrayInputStream bais = new ByteArrayInputStream(serialized); DecompressibleInputStream dis = new DecompressibleInputStream(bais) + ) { deserialized = (HdfsSaveMetadata) dis.readObject(); - } catch (IOException | ClassNotFoundException e) { + } + catch (IOException | ClassNotFoundException e) { throw new RuntimeException("Error deserializing metadata object: " + e); } diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepKafkaStep.java b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepKafkaStep.java index f5aa136..cdc9d7b 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepKafkaStep.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepKafkaStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.teragrep; import com.google.gson.Gson; @@ -93,7 +92,12 @@ public final class TeragrepKafkaStep extends AbstractStep implements Flushable { private final static String KAFKA_SECURITY_PROTOCOL_CONFIG_ITEM = "dpl.pth_10.transform.teragrep.kafka.save.security.protocol"; private final static String DEFAULT_KAFKA_TOPIC_TEMPLATE = "teragrep.%s.%s"; - public TeragrepKafkaStep(String hdfsPath, DPLParserCatalystContext catCtx, Config zeppelinConfig, String kafkaTopic) { + public TeragrepKafkaStep( + String hdfsPath, + DPLParserCatalystContext catCtx, + Config zeppelinConfig, + String kafkaTopic + ) { this.hdfsPath = hdfsPath; this.catCtx = catCtx; this.zeppelinConfig = zeppelinConfig; @@ -107,7 +111,8 @@ public void flush() { // last version of the whole dataset has to be saved to kafka now. try { this.dfKafkaWriter.save(); - } catch (Exception e) { + } + catch (Exception e) { throw new RuntimeException("Error saving dataframe: " + e); } } @@ -130,7 +135,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // exists, but null or empty string throw new RuntimeException("Identity was null"); } - } else { + } + else { // no config item throw new RuntimeException("Missing configuration item: '" + FALLBACK_S3_IDENTITY_CONFIG_ITEM + "'."); } @@ -141,17 +147,20 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // exists, but null or empty string throw new RuntimeException("Credential was null"); } - } else { + } + else { // no config item throw new RuntimeException("Missing configuration item: '" + FALLBACK_S3_CREDENTIAL_CONFIG_ITEM + "'."); } - } else { + } + else { // ignore anything after '@' char in username identity = identity.split("@")[0]; } // set jaas config string based on identity & credential - final String jaasConfig = "org.apache.kafka.common.security.plain.PlainLoginModule required username=\"" + identity + "\" password=\"" + credential + "\";"; + final String jaasConfig = "org.apache.kafka.common.security.plain.PlainLoginModule required username=\"" + + identity + "\" password=\"" + credential + "\";"; if (kafkaTopic == null || kafkaTopic.equals("")) { // default kafka topic if not one specified by user @@ -171,17 +180,24 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { if (toKafkaDs.isStreaming()) { // parallel mode // Convert each row to a single "value" column as JSON - toKafkaDs = toKafkaDs.map((MapFunction) r -> { - // get column names as a scala sequence - Seq seqOfColumnNames = JavaConversions.asScalaBuffer(Arrays.asList(r.schema().fieldNames())); - - // Get values for each of the columns as a map, and create json out of the map - // getValuesMap() returns a Scala map; mapAsJavaMap converts it to a java map which Gson processes correctly - String json = new Gson().toJson(JavaConversions.mapAsJavaMap(r.getValuesMap(seqOfColumnNames))); - - // Return final row - return RowFactory.create(json); - }, RowEncoder.apply(new StructType(new StructField[]{new StructField("value", DataTypes.StringType, true, new MetadataBuilder().build())}))); + toKafkaDs = toKafkaDs + .map( + (MapFunction) r -> { + // get column names as a scala sequence + Seq seqOfColumnNames = JavaConversions + .asScalaBuffer(Arrays.asList(r.schema().fieldNames())); + + // Get values for each of the columns as a map, and create json out of the map + // getValuesMap() returns a Scala map; mapAsJavaMap converts it to a java map which Gson processes correctly + String json = new Gson() + 
.toJson(JavaConversions.mapAsJavaMap(r.getValuesMap(seqOfColumnNames))); + + // Return final row + return RowFactory.create(json); + }, RowEncoder.apply(new StructType(new StructField[] { + new StructField("value", DataTypes.StringType, true, new MetadataBuilder().build()) + })) + ); } else { // sequential mode, all as one event List jsonList = toKafkaDs.toJSON().collectAsList(); @@ -194,8 +210,11 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { json = Arrays.toString(jsonList.toArray()); } - toKafkaDs = catCtx.getSparkSession().createDataFrame(Collections.singletonList(RowFactory.create(json)), - new StructType(new StructField[]{new StructField("value", DataTypes.StringType, true, new MetadataBuilder().build())})); + toKafkaDs = catCtx + .getSparkSession() + .createDataFrame(Collections.singletonList(RowFactory.create(json)), new StructType(new StructField[] { + new StructField("value", DataTypes.StringType, true, new MetadataBuilder().build()) + })); } String kafkaBootstrapServers; @@ -210,7 +229,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // exists, but null or empty string throw new RuntimeException("Kafka save bootstrap servers config not properly set."); } - } else { + } + else { // config item does not exist at all throw new RuntimeException("Missing configuration item: '" + KAFKA_BOOTSTRAP_SERVERS_CONFIG_ITEM + "'."); } @@ -221,7 +241,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // exists, but null or empty string throw new RuntimeException("Kafka save sasl mechanism config not properly set."); } - } else { + } + else { // config item does not exist at all throw new RuntimeException("Missing configuration item: '" + KAFKA_SASL_MECHANISM_CONFIG_ITEM + "'."); } @@ -232,7 +253,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // exists, but null or empty string throw new RuntimeException("Kafka save security protocol config not properly set."); } - } else { + } + else { // config item does not exist at all throw new RuntimeException("Missing configuration item: '" + KAFKA_SECURITY_PROTOCOL_CONFIG_ITEM + "'."); } @@ -243,7 +265,8 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // exists, but null or empty string throw new RuntimeException("S3 endpoint config not properly set."); } - } else { + } + else { // config item does not exist at all throw new RuntimeException("Missing configuration item: '" + S3_CREDENTIAL_ENDPOINT_CONFIG_ITEM + "'."); } @@ -290,12 +313,15 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { if (catCtx.getStepList().getAggregateCount() > 0) { kafkaWriter.outputMode(OutputMode.Complete()); - } else { + } + else { // should always be this, since aggregates should be in sequential kafkaWriter.outputMode(OutputMode.Append()); } - StreamingQuery kafkaQuery = catCtx.getInternalStreamingQueryListener().registerQuery(kafkaQueryName, kafkaWriter); + StreamingQuery kafkaQuery = catCtx + .getInternalStreamingQueryListener() + .registerQuery(kafkaQueryName, kafkaWriter); // Await for the listener to send the stop signal kafkaQuery.awaitTermination(); diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepSyslogStep.java b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepSyslogStep.java index 91fc87a..9fb72e0 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepSyslogStep.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepSyslogStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to 
Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.teragrep; import com.teragrep.pth10.ast.commands.transformstatement.teragrep.SyslogStreamer; @@ -54,10 +53,11 @@ import org.slf4j.LoggerFactory; /** - * teragrep exec syslog stream host x port y - * Sends the topmost dataset popped from the ProcessingStack as a series of syslog messages to the given RELP server. + * teragrep exec syslog stream host x port y Sends the topmost dataset popped from the ProcessingStack as a + * series of syslog messages to the given RELP server. */ public class TeragrepSyslogStep extends AbstractStep { + private static final Logger LOGGER = LoggerFactory.getLogger(TeragrepSyslogStep.class); public final String relpHost; diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepSystemStep.java b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepSystemStep.java index 7f4c811..fed565c 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepSystemStep.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/TeragrepSystemStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.teragrep; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -62,10 +61,11 @@ import java.util.Properties; /** - * teragrep get system version: - * Returns a dataset containing the various version numbers of the components used in Teragrep + * teragrep get system version: Returns a dataset containing the various version numbers of the components used in + * Teragrep */ public final class TeragrepSystemStep extends AbstractStep { + private static final Logger LOGGER = LoggerFactory.getLogger(TeragrepSystemStep.class); private final DPLParserCatalystContext catCtx; @@ -89,15 +89,19 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { GeneratedDatasource datasource = new GeneratedDatasource(catCtx); try { dataset = datasource.constructStream(versions, explainStr); - } catch (InterruptedException | UnknownHostException e) { + } + catch (InterruptedException | UnknownHostException e) { throw new RuntimeException(e); } } else { // getComponentVersions() requires jar packaging - LOGGER.error("Teragrep get system version: Versions list was NULL, meaning the version properties could " + - "not be fetched. This might be caused by running this command in a development environment."); + LOGGER + .error( + "Teragrep get system version: Versions list was NULL, meaning the version properties could " + + "not be fetched. This might be caused by running this command in a development environment." + ); } return dataset; @@ -105,13 +109,18 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { /** * Gets the various Teragrep component versions + * * @return component versions as a list */ private List getComponentVersions() { final List rv = new ArrayList<>(); LOGGER.info("programmatically resolved package: <{}>", TeragrepSystemStep.class.getPackage()); - LOGGER.info("programmatically resolved: <{}>", TeragrepSystemStep.class.getPackage().getImplementationVersion()); + LOGGER + .info( + "programmatically resolved: <{}>", + TeragrepSystemStep.class.getPackage().getImplementationVersion() + ); java.io.InputStream is = TeragrepSystemStep.class.getClassLoader().getResourceAsStream("maven.properties"); java.util.Properties p = new Properties(); @@ -128,7 +137,8 @@ private List getComponentVersions() { rv.add(splitProperty[1] + " version: " + p.getProperty(property)); } }); - } catch (IOException | NullPointerException e) { + } + catch (IOException | NullPointerException e) { e.printStackTrace(); } return rv; diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/BloomFilterForeachPartitionFunction.java b/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/BloomFilterForeachPartitionFunction.java index 0864ce9..855a5b6 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/BloomFilterForeachPartitionFunction.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/BloomFilterForeachPartitionFunction.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022, 2023 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. 
* * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.teragrep.bloomfilter; import com.typesafe.config.Config; @@ -64,6 +63,7 @@ public BloomFilterForeachPartitionFunction(Config config, boolean overwrite) { this.lazyConnection = new LazyConnection(config); this.overwrite = overwrite; } + public BloomFilterForeachPartitionFunction(Config config) { this.filterSizes = new FilterSizes(config); this.lazyConnection = new LazyConnection(config); @@ -81,8 +81,7 @@ public void call(Iterator iter) throws Exception { String partition = row.getString(0); byte[] filterBytes = (byte[]) row.get(1); - TeragrepBloomFilter filter = - new TeragrepBloomFilter(partition, filterBytes, conn, filterSizes); + TeragrepBloomFilter filter = new TeragrepBloomFilter(partition, filterBytes, conn, filterSizes); filter.saveFilter(overwrite); conn.commit(); diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/FilterSizes.java b/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/FilterSizes.java index e8fdf19..1679345 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/FilterSizes.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/FilterSizes.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022, 2023 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.teragrep.bloomfilter; import com.google.gson.Gson; @@ -60,11 +59,12 @@ import static com.teragrep.pth10.steps.teragrep.TeragrepBloomStep.BLOOM_NUMBER_OF_FIELDS_CONFIG_ITEM; public class FilterSizes implements Serializable { + private static final Logger LOGGER = LoggerFactory.getLogger(FilterSizes.class); private final Config config; - private final ArrayList> mapCache = new ArrayList<>(1); - private final ArrayList> bitSizeMapCache = new ArrayList<>(1); + private final ArrayList> mapCache = new ArrayList<>(1); + private final ArrayList> bitSizeMapCache = new ArrayList<>(1); public FilterSizes(Config config) { this.config = config; @@ -73,8 +73,8 @@ public FilterSizes(Config config) { /** * Filter sizes as sorted map *
- * Keys = filter expected num of items, - * values = filter FPP + * Keys = filter expected num of items, values = filter FPP + * * @return SortedMap of filter configuration */ public SortedMap asSortedMap() { @@ -104,10 +104,8 @@ private SortedMap mapFromConfig() { SortedMap sizesMapFromJson = new TreeMap<>(); Gson gson = new Gson(); - List jsonArray = gson.fromJson( - sizesJsonString(), - new TypeToken>(){}.getType() - ); + List jsonArray = gson.fromJson(sizesJsonString(), new TypeToken>() { + }.getType()); for (JsonObject object : jsonArray) { if (object.has("expected") && object.has("fpp")) { @@ -118,7 +116,8 @@ private SortedMap mapFromConfig() { throw new RuntimeException("Duplicate entry expected num of items"); } sizesMapFromJson.put(expectedNumOfItems, fpp); - } else { + } + else { throw new RuntimeException("JSON did not have expected values of 'expected' or 'fpp'"); } } @@ -132,7 +131,8 @@ private String sizesJsonString() { if (jsonString == null || jsonString.isEmpty()) { throw new RuntimeException("Bloom filter fields not configured."); } - } else { + } + else { throw new RuntimeException("Missing configuration item: '" + BLOOM_NUMBER_OF_FIELDS_CONFIG_ITEM + "'."); } return jsonString; diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/LazyConnection.java b/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/LazyConnection.java index 64439a3..0196091 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/LazyConnection.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/LazyConnection.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022, 2023 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.teragrep.bloomfilter; import java.io.Serializable; @@ -53,12 +52,12 @@ import com.typesafe.config.Config; - import static com.teragrep.pth10.steps.teragrep.TeragrepBloomStep.BLOOMDB_URL_CONFIG_ITEM; import static com.teragrep.pth10.steps.teragrep.TeragrepBloomStep.BLOOMDB_USERNAME_CONFIG_ITEM; import static com.teragrep.pth10.steps.teragrep.TeragrepBloomStep.BLOOMDB_PASSWORD_CONFIG_ITEM; public class LazyConnection implements Serializable { + private static Connection connection = null; private final Config config; @@ -71,16 +70,13 @@ public synchronized Connection get() { // lazy init String connectionURL = connectionURL(); String username = connectionUsername(); - String password= connectionPassword(); + String password = connectionPassword(); try { - connection = DriverManager.getConnection( - connectionURL, - username, - password - ); + connection = DriverManager.getConnection(connectionURL, username, password); - } catch (SQLException e) { + } + catch (SQLException e) { throw new RuntimeException(e); } } @@ -95,7 +91,8 @@ private String connectionUsername() { if (username == null || username.isEmpty()) { throw new RuntimeException("Database username not set."); } - } else { + } + else { throw new RuntimeException("Missing configuration item: '" + BLOOMDB_USERNAME_CONFIG_ITEM + "'."); } return username; @@ -109,7 +106,8 @@ private String connectionPassword() { if (password == null) { throw new RuntimeException("Database password not set."); } - } else { + } + else { throw new RuntimeException("Missing configuration item: '" + BLOOMDB_PASSWORD_CONFIG_ITEM + "'."); } return password; @@ -123,7 +121,8 @@ private String connectionURL() { if (databaseUrl == null || databaseUrl.isEmpty()) { throw new RuntimeException("Database url not set."); } - } else { + } + else { throw new RuntimeException("Missing configuration item: '" + BLOOMDB_URL_CONFIG_ITEM + "'."); } return databaseUrl; diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/TeragrepBloomFilter.java b/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/TeragrepBloomFilter.java index 69eb449..29f1023 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/TeragrepBloomFilter.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/bloomfilter/TeragrepBloomFilter.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022, 2023 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.teragrep.bloomfilter; import org.apache.spark.util.sketch.BloomFilter; @@ -65,6 +64,7 @@ * Class create a selected sized {@link BloomFilter} and run operations */ public class TeragrepBloomFilter { + private static final Logger LOGGER = LoggerFactory.getLogger(TeragrepBloomFilter.class); private final String partitionID; @@ -74,7 +74,12 @@ public class TeragrepBloomFilter { private Long selectedExpectedNumOfItems; private Double selectedFpp; - public TeragrepBloomFilter(String partition, byte[] bloomfilterBytes, Connection connection, FilterSizes filterSizes) { + public TeragrepBloomFilter( + String partition, + byte[] bloomfilterBytes, + Connection connection, + FilterSizes filterSizes + ) { this.partitionID = partition; this.bloomfilterBytes = bloomfilterBytes; this.filterSizes = filterSizes; @@ -83,8 +88,8 @@ public TeragrepBloomFilter(String partition, byte[] bloomfilterBytes, Connection private BloomFilter sizedFilter() { - SortedMap filterSizesMap = filterSizes.asSortedMap(); - Map bitsizeToExpectedItemsMap = filterSizes.asBitsizeSortedMap(); + SortedMap filterSizesMap = filterSizes.asSortedMap(); + Map bitsizeToExpectedItemsMap = filterSizes.asBitsizeSortedMap(); try (ByteArrayInputStream bais = new ByteArrayInputStream(bloomfilterBytes)) { BloomFilter bf = BloomFilter.readFrom(bais); @@ -96,16 +101,19 @@ private BloomFilter sizedFilter() { this.selectedExpectedNumOfItems = expectedItems; this.selectedFpp = fpp; return bf; - } else { + } + else { throw new IllegalArgumentException("no such filterSize <[" + bitSize + "]>"); } - } catch (IOException e) { + } + catch (IOException e) { throw new RuntimeException(e); } } /** * Write filter bytes to database + * * @param overwriteExisting Set if existing filter data will be overwritten */ public void saveFilter(Boolean overwriteExisting) { @@ -115,8 +123,11 @@ public void saveFilter(Boolean overwriteExisting) { try (PreparedStatement stmt = connection.prepareStatement(sql)) { try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - LOGGER.info("Saving filter[expected: <{}> , fpp: <{}>] to bloomdb.bloomfilter, overwrite existing data: <{}>", - selectedExpectedNumOfItems, selectedFpp, overwriteExisting); + LOGGER + .info( + "Saving filter[expected: <{}> , fpp: <{}>] to bloomdb.bloomfilter, overwrite existing data: <{}>", + selectedExpectedNumOfItems, selectedFpp, overwriteExisting + ); filter.writeTo(baos); InputStream is = new ByteArrayInputStream(baos.toByteArray()); @@ -130,13 +141,16 @@ public void saveFilter(Boolean overwriteExisting) { is.close(); connection.commit(); - - } catch (IOException e) { + + } + catch (IOException e) { throw new RuntimeException("Error serializing data\n" + e); - } catch (SQLException e) { + } + catch (SQLException e) { throw new RuntimeException("Error writing to database\n" + e); } - } catch (SQLException e) { + } + catch (SQLException e) { throw new RuntimeException("Error generating a prepared statement\n" + e); } } @@ -144,11 +158,12 @@ public void saveFilter(Boolean overwriteExisting) { private static String sqlString(Boolean overwriteExisting) { final String sql; if (overwriteExisting) { - sql = "REPLACE INTO `bloomfilter` (`partition_id`, `filter_type_id`, `filter`) " + - "VALUES(?, (SELECT `id` FROM `filtertype` WHERE expectedElements=? AND targetFpp=?),?)"; - } else { - sql = "INSERT IGNORE INTO `bloomfilter` (`partition_id`, `filter_type_id`, `filter`) " + - "VALUES(?, (SELECT `id` FROM `filtertype` WHERE expectedElements=? 
AND targetFpp=?),?)"; + sql = "REPLACE INTO `bloomfilter` (`partition_id`, `filter_type_id`, `filter`) " + + "VALUES(?, (SELECT `id` FROM `filtertype` WHERE expectedElements=? AND targetFpp=?),?)"; + } + else { + sql = "INSERT IGNORE INTO `bloomfilter` (`partition_id`, `filter_type_id`, `filter`) " + + "VALUES(?, (SELECT `id` FROM `filtertype` WHERE expectedElements=? AND targetFpp=?),?)"; } return sql; } diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/dynatrace/DynatraceItem.java b/src/main/java/com/teragrep/pth10/steps/teragrep/dynatrace/DynatraceItem.java index 7ef75c0..738a5c0 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/dynatrace/DynatraceItem.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/dynatrace/DynatraceItem.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -52,6 +52,7 @@ import java.util.Map; public class DynatraceItem implements Serializable { + private String min; private String max; private String sum; @@ -61,6 +62,7 @@ public class DynatraceItem implements Serializable { private String metricKey; private Timestamp timestamp; private String dplQuery; + public DynatraceItem() { this.otherAggregates = new HashMap<>(); this.dimensions = new HashMap<>(); @@ -160,8 +162,11 @@ private String buildOutputString() { final String format = "%s.%s%s gauge,min=%s,max=%s,sum=%s,count=%s %s\n%s"; final String otherAggs = buildOtherAggregatesString(); final String dims = buildDimensionString(!otherAggs.isEmpty()); - final String output = String.format(format, metricKey, otherAggs, dims, - minString(), maxString(), sumString(), countString(), timestamp.toInstant().getEpochSecond() * 1000L, dtMeta); + final String output = String + .format( + format, metricKey, otherAggs, dims, minString(), maxString(), sumString(), countString(), + timestamp.toInstant().getEpochSecond() * 1000L, dtMeta + ); return output; } @@ -201,16 +206,17 @@ private String buildOtherAggregatesString() { return builder.toString(); } - private String getAggString(String aggName) { if (aggName == null) { if (!otherAggregates.isEmpty()) { Iterator it = otherAggregates.values().iterator(); return it.next(); - } else { + } + else { return "1"; } - } else { + } + else { return aggName; } } @@ -228,9 +234,10 @@ private String sumString() { } private String countString() { - if (count==null) { + if (count == null) { return "1"; - } else { + } + else { return count; } } diff --git a/src/main/java/com/teragrep/pth10/steps/teragrep/dynatrace/DynatraceMetadata.java b/src/main/java/com/teragrep/pth10/steps/teragrep/dynatrace/DynatraceMetadata.java index aa4ea2d..059119c 100644 --- a/src/main/java/com/teragrep/pth10/steps/teragrep/dynatrace/DynatraceMetadata.java +++ b/src/main/java/com/teragrep/pth10/steps/teragrep/dynatrace/DynatraceMetadata.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 
2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -48,6 +48,7 @@ import java.io.Serializable; public class DynatraceMetadata implements Serializable { + private static final long serialVersionUID = 1L; private String displayName; private String description; @@ -60,6 +61,7 @@ public DynatraceMetadata() { this.unit = "unspecified"; this.metricKey = ""; } + public DynatraceMetadata(String dispName, String metricKey) { this.displayName = dispName; this.description = "Query Statistics"; @@ -67,7 +69,6 @@ public DynatraceMetadata(String dispName, String metricKey) { this.metricKey = metricKey; } - public void setDescription(String description) { this.description = description; } @@ -102,18 +103,8 @@ public String getMetricKey() { @Override public String toString() { - final String s = '#' + - metricKey + - ' ' + - "gauge" + - ' ' + - "dt.meta.displayName=\"" + - displayName + - "\", dt.meta.description=\"" + - description + - "\", dt.meta.unit=\"" + - unit + - "\""; + final String s = '#' + metricKey + ' ' + "gauge" + ' ' + "dt.meta.displayName=\"" + displayName + + "\", dt.meta.description=\"" + description + "\", dt.meta.unit=\"" + unit + "\""; return s; } } diff --git a/src/main/java/com/teragrep/pth10/steps/timechart/AbstractTimechartStep.java b/src/main/java/com/teragrep/pth10/steps/timechart/AbstractTimechartStep.java index 9c71b14..28bf1c8 100644 --- a/src/main/java/com/teragrep/pth10/steps/timechart/AbstractTimechartStep.java +++ b/src/main/java/com/teragrep/pth10/steps/timechart/AbstractTimechartStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.timechart; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -53,6 +52,7 @@ import java.util.List; public abstract class AbstractTimechartStep extends AbstractStep { + protected List aggCols = null; protected List divByInsts = null; protected Column span = null; diff --git a/src/main/java/com/teragrep/pth10/steps/timechart/TimechartStep.java b/src/main/java/com/teragrep/pth10/steps/timechart/TimechartStep.java index 86e70f1..9849094 100644 --- a/src/main/java/com/teragrep/pth10/steps/timechart/TimechartStep.java +++ b/src/main/java/com/teragrep/pth10/steps/timechart/TimechartStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.timechart; import org.apache.spark.sql.*; @@ -55,6 +54,7 @@ import java.util.stream.Collectors; public final class TimechartStep extends AbstractTimechartStep { + public TimechartStep() { super(); this.properties.add(CommandProperty.AGGREGATE); @@ -72,14 +72,12 @@ public Dataset get(Dataset dataset) { // .agg has funky arguments; just giving a Seq of columns is no good, first arg needs to be a column Column firstAggCol = this.aggCols.get(0); - Seq seqOfAggColsExceptFirst = JavaConversions.asScalaBuffer(this.aggCols.subList(1, this.aggCols.size())); + Seq seqOfAggColsExceptFirst = JavaConversions + .asScalaBuffer(this.aggCols.subList(1, this.aggCols.size())); List allGroupBys = new ArrayList<>(); allGroupBys.add(this.span); - allGroupBys.addAll(this.divByInsts - .stream() - .map(functions::col) - .collect(Collectors.toList())); + allGroupBys.addAll(this.divByInsts.stream().map(functions::col).collect(Collectors.toList())); Seq seqOfAllGroupBys = JavaConversions.asScalaBuffer(allGroupBys); diff --git a/src/main/java/com/teragrep/pth10/steps/tokenizer/AbstractTokenizerStep.java b/src/main/java/com/teragrep/pth10/steps/tokenizer/AbstractTokenizerStep.java index d36c11b..527836e 100644 --- a/src/main/java/com/teragrep/pth10/steps/tokenizer/AbstractTokenizerStep.java +++ b/src/main/java/com/teragrep/pth10/steps/tokenizer/AbstractTokenizerStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -43,12 +43,12 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.tokenizer; import com.teragrep.pth10.steps.AbstractStep; public abstract class AbstractTokenizerStep extends AbstractStep { + public enum TokenizerFormat { STRING, BYTES } @@ -57,12 +57,14 @@ public enum TokenizerFormat { protected String outputCol = "tokens"; protected TokenizerFormat tokenizerFormat = TokenizerFormat.STRING; + public AbstractTokenizerStep() { super(); } /** * Sets the field for the tokenizer to run on + * * @param inputCol field name, defaults to '_raw' */ public void setInputCol(String inputCol) { @@ -71,6 +73,7 @@ public void setInputCol(String inputCol) { /** * Set Tokenizer output column + * * @param outputCol output column */ public void setOutputCol(String outputCol) { @@ -79,6 +82,7 @@ public void setOutputCol(String outputCol) { /** * Set whether to return byte array or string + * * @param tokenizerFormat format enum; string or bytes */ public void setTokenizerFormat(TokenizerFormat tokenizerFormat) { @@ -87,6 +91,7 @@ public void setTokenizerFormat(TokenizerFormat tokenizerFormat) { /** * Gets the field set for the tokenizer + * * @return field name used in the tokenizer, default '_raw' */ public String getInputCol() { @@ -95,6 +100,7 @@ public String getInputCol() { /** * Get tokenizer output column + * * @return output column */ public String getOutputCol() { @@ -103,6 +109,7 @@ public String getOutputCol() { /** * Get Tokenizer return type; bytes or string + * * @return bytes or string enum */ public TokenizerFormat getTokenizerFormat() { diff --git a/src/main/java/com/teragrep/pth10/steps/tokenizer/TokenizerStep.java b/src/main/java/com/teragrep/pth10/steps/tokenizer/TokenizerStep.java index c024705..5126d65 100644 --- a/src/main/java/com/teragrep/pth10/steps/tokenizer/TokenizerStep.java +++ b/src/main/java/com/teragrep/pth10/steps/tokenizer/TokenizerStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.tokenizer; import com.teragrep.functions.dpf_03.ByteArrayListAsStringListUDF; @@ -57,10 +56,10 @@ import static org.apache.spark.sql.types.DataTypes.StringType; /** - * Runs the dpf_03 TokenAggregator on given field - * - Returns a Row with type String[] + * Runs the dpf_03 TokenAggregator on given field - Returns a Row with type String[] */ public final class TokenizerStep extends AbstractTokenizerStep { + public TokenizerStep(AbstractTokenizerStep.TokenizerFormat tokenizerFormat, String inputCol, String outputCol) { super(); this.tokenizerFormat = tokenizerFormat; @@ -75,16 +74,20 @@ public Dataset get(Dataset dataset) { } // dpf_03 custom tokenizer udf - UserDefinedFunction tokenizerUDF = - functions.udf(new TokenizerUDF(), DataTypes.createArrayType(DataTypes.BinaryType, false)); - - + UserDefinedFunction tokenizerUDF = functions + .udf(new TokenizerUDF(), DataTypes.createArrayType(DataTypes.BinaryType, false)); if (this.tokenizerFormat == AbstractTokenizerStep.TokenizerFormat.BYTES) { return dataset.withColumn(this.getOutputCol(), tokenizerUDF.apply(functions.col(this.getInputCol()))); - } else if (this.tokenizerFormat == AbstractTokenizerStep.TokenizerFormat.STRING) { - UserDefinedFunction byteArrayListAsStringListUDF = functions.udf(new ByteArrayListAsStringListUDF(), DataTypes.createArrayType(StringType)); - return dataset.withColumn(this.getOutputCol(), byteArrayListAsStringListUDF.apply(tokenizerUDF.apply(functions.col(this.getInputCol())))); + } + else if (this.tokenizerFormat == AbstractTokenizerStep.TokenizerFormat.STRING) { + UserDefinedFunction byteArrayListAsStringListUDF = functions + .udf(new ByteArrayListAsStringListUDF(), DataTypes.createArrayType(StringType)); + return dataset + .withColumn( + this.getOutputCol(), + byteArrayListAsStringListUDF.apply(tokenizerUDF.apply(functions.col(this.getInputCol()))) + ); } throw new IllegalStateException("Unexpected tokenizerFormat: " + this.tokenizerFormat); diff --git a/src/main/java/com/teragrep/pth10/steps/top/AbstractTopStep.java b/src/main/java/com/teragrep/pth10/steps/top/AbstractTopStep.java index 3540577..f77e82b 100644 --- a/src/main/java/com/teragrep/pth10/steps/top/AbstractTopStep.java +++ b/src/main/java/com/teragrep/pth10/steps/top/AbstractTopStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.top; import com.teragrep.pth10.steps.AbstractStep; @@ -51,8 +50,10 @@ import java.util.List; public abstract class AbstractTopStep extends AbstractStep { + protected int limit = 10; private List listOfFields = null; + public AbstractTopStep() { super(); } diff --git a/src/main/java/com/teragrep/pth10/steps/top/TopStep.java b/src/main/java/com/teragrep/pth10/steps/top/TopStep.java index cfc23c8..767ef35 100644 --- a/src/main/java/com/teragrep/pth10/steps/top/TopStep.java +++ b/src/main/java/com/teragrep/pth10/steps/top/TopStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,13 +43,13 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.top; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -public final class TopStep extends AbstractTopStep{ +public final class TopStep extends AbstractTopStep { + public TopStep() { super(); } diff --git a/src/main/java/com/teragrep/pth10/steps/where/AbstractWhereStep.java b/src/main/java/com/teragrep/pth10/steps/where/AbstractWhereStep.java index c0f959a..6f77af7 100644 --- a/src/main/java/com/teragrep/pth10/steps/where/AbstractWhereStep.java +++ b/src/main/java/com/teragrep/pth10/steps/where/AbstractWhereStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,13 +43,13 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10.steps.where; import com.teragrep.pth10.steps.AbstractStep; import org.apache.spark.sql.Column; public abstract class AbstractWhereStep extends AbstractStep { + protected Column whereColumn = null; public AbstractWhereStep() { diff --git a/src/main/java/com/teragrep/pth10/steps/where/WhereStep.java b/src/main/java/com/teragrep/pth10/steps/where/WhereStep.java index 5f30092..257d3c8 100644 --- a/src/main/java/com/teragrep/pth10/steps/where/WhereStep.java +++ b/src/main/java/com/teragrep/pth10/steps/where/WhereStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,13 +43,13 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.where; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -public final class WhereStep extends AbstractWhereStep{ +public final class WhereStep extends AbstractWhereStep { + public WhereStep() { super(); } diff --git a/src/main/java/com/teragrep/pth10/steps/xmlkv/AbstractXmlkvStep.java b/src/main/java/com/teragrep/pth10/steps/xmlkv/AbstractXmlkvStep.java index 5c9d95a..fa20a48 100644 --- a/src/main/java/com/teragrep/pth10/steps/xmlkv/AbstractXmlkvStep.java +++ b/src/main/java/com/teragrep/pth10/steps/xmlkv/AbstractXmlkvStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -49,6 +49,7 @@ import com.teragrep.pth10.steps.AbstractStep; public abstract class AbstractXmlkvStep extends AbstractStep { + protected DPLParserCatalystContext catCtx; protected String field = "_raw"; protected int maxInputs = 50000; diff --git a/src/main/java/com/teragrep/pth10/steps/xmlkv/XmlkvStep.java b/src/main/java/com/teragrep/pth10/steps/xmlkv/XmlkvStep.java index 565d99f..807844d 100644 --- a/src/main/java/com/teragrep/pth10/steps/xmlkv/XmlkvStep.java +++ b/src/main/java/com/teragrep/pth10/steps/xmlkv/XmlkvStep.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -55,6 +55,7 @@ import java.util.Set; public class XmlkvStep extends AbstractXmlkvStep { + @Override public Dataset get(Dataset dataset) throws StreamingQueryException { final String mapColName = "$$dpl_internal_xmlkv_col$$"; @@ -73,18 +74,20 @@ public Dataset get(Dataset dataset) throws StreamingQueryException { // Check for nulls; return an empty string if null, otherwise value for given key for (String key : keys) { - dataset = dataset.withColumn( - key, - functions.when( - /* if key.value == null */ - functions.isnull(dataset.col(mapColName).getItem(key)), - /* then return empty string */ - functions.lit("")) - /* otherwise return key.value */ - .otherwise(dataset.col(mapColName).getItem(key))); + dataset = dataset + .withColumn( + key, functions + .when( + /* if key.value == null */ + functions.isnull(dataset.col(mapColName).getItem(key)), + /* then return empty string */ + functions.lit("") + ) + /* otherwise return key.value */ + .otherwise(dataset.col(mapColName).getItem(key)) + ); } - return dataset.drop(mapColName); } } diff --git a/src/test/java/com/teragrep/pth10/AccumTransformationStreamingTest.java b/src/test/java/com/teragrep/pth10/AccumTransformationStreamingTest.java index 781146f..07d497f 100644 --- a/src/test/java/com/teragrep/pth10/AccumTransformationStreamingTest.java +++ b/src/test/java/com/teragrep/pth10/AccumTransformationStreamingTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -63,98 +63,119 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class AccumTransformationStreamingTest { - private static final Logger LOGGER = LoggerFactory.getLogger(AccumTransformationStreamingTest.class); - private final String numberDataTestFile = "src/test/resources/numberData_0*.json"; // * to make the path into a directory path - private final String numberDataWithMixedStringsTestFile = "src/test/resources/numberData_withMixedStrings*.json"; - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); - - private StreamingTestUtil streamingTestUtil; - - @org.junit.jupiter.api.BeforeAll - void setEnv() { - this.streamingTestUtil = new StreamingTestUtil(this.testSchema); - this.streamingTestUtil.setEnv(); - } - - @org.junit.jupiter.api.BeforeEach - void setUp() { - this.streamingTestUtil.setUp(); - } - - @org.junit.jupiter.api.AfterEach - void tearDown() { - this.streamingTestUtil.tearDown(); - } - - // ---------------------------------------- - // Tests - // ---------------------------------------- - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void accumBasicQueryTest() { - streamingTestUtil.performDPLTest( - "index=* | accum _raw", - numberDataTestFile, - ds -> { - List rawCol = ds.select("_raw").collectAsList().stream().map(r->r.getAs(0)).collect(Collectors.toList()); - List expected = Arrays.asList("-10", "-10", "0", "35", "82.2"); - assertTrue(rawCol.containsAll(expected)); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void accumRenameFieldQueryTest() { - streamingTestUtil.performDPLTest( - "index=* | accum _raw as new", - numberDataTestFile, - ds -> { - List newCol = ds.select("new").collectAsList().stream().map(r->r.getAs(0)).collect(Collectors.toList()); - List expected = Arrays.asList("-10", "-10", "0", "35", "82.2"); - assertTrue(newCol.containsAll(expected)); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void accumMixedStringsQueryTest() { - streamingTestUtil.performDPLTest( - "index=* | accum _raw", - numberDataWithMixedStringsTestFile, - ds -> { - List rawCol = ds.select("_raw").collectAsList().stream().map(r->r.getAs(0)).collect(Collectors.toList()); - // expect to skip strings in data and return original data as-is - List expected = Arrays.asList("10", "string", "110", "another_string", "165.0"); - assertTrue(rawCol.containsAll(expected)); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void accumMixedStringsQueryWithRenameFieldTest() { - streamingTestUtil.performDPLTest( - "index=* | accum _raw as new", - numberDataWithMixedStringsTestFile, - ds -> { - List newCol = 
ds.select("new").collectAsList().stream().map(r->r.getAs(0)).collect(Collectors.toList()); - // expect to skip strings in data and return empty - List expected = Arrays.asList("10", streamingTestUtil.getCtx().nullValue.value(), "110", streamingTestUtil.getCtx().nullValue.value(), "165.0"); - assertTrue(newCol.containsAll(expected)); - }); - } + private static final Logger LOGGER = LoggerFactory.getLogger(AccumTransformationStreamingTest.class); + + private final String numberDataTestFile = "src/test/resources/numberData_0*.json"; // * to make the path into a directory path + private final String numberDataWithMixedStringsTestFile = "src/test/resources/numberData_withMixedStrings*.json"; + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); + + private StreamingTestUtil streamingTestUtil; + + @org.junit.jupiter.api.BeforeAll + void setEnv() { + this.streamingTestUtil = new StreamingTestUtil(this.testSchema); + this.streamingTestUtil.setEnv(); + } + + @org.junit.jupiter.api.BeforeEach + void setUp() { + this.streamingTestUtil.setUp(); + } + + @org.junit.jupiter.api.AfterEach + void tearDown() { + this.streamingTestUtil.tearDown(); + } + + // ---------------------------------------- + // Tests + // ---------------------------------------- + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void accumBasicQueryTest() { + streamingTestUtil.performDPLTest("index=* | accum _raw", numberDataTestFile, ds -> { + List rawCol = ds + .select("_raw") + .collectAsList() + .stream() + .map(r -> r.getAs(0)) + .collect(Collectors.toList()); + List expected = Arrays.asList("-10", "-10", "0", "35", "82.2"); + assertTrue(rawCol.containsAll(expected)); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void accumRenameFieldQueryTest() { + streamingTestUtil.performDPLTest("index=* | accum _raw as new", numberDataTestFile, ds -> { + List newCol = ds + .select("new") + .collectAsList() + .stream() + .map(r -> r.getAs(0)) + .collect(Collectors.toList()); + List expected = Arrays.asList("-10", "-10", "0", "35", "82.2"); + assertTrue(newCol.containsAll(expected)); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void accumMixedStringsQueryTest() { + streamingTestUtil.performDPLTest("index=* | accum _raw", numberDataWithMixedStringsTestFile, ds -> { + List rawCol = ds + .select("_raw") + .collectAsList() + .stream() + .map(r -> r.getAs(0)) + .collect(Collectors.toList()); + // expect to skip strings in data and return original data as-is + List expected = Arrays.asList("10", "string", "110", "another_string", "165.0"); + assertTrue(rawCol.containsAll(expected)); + }); + } + + @Test + @DisabledIfSystemProperty( + named = 
"skipSparkTest", + matches = "true" + ) + public void accumMixedStringsQueryWithRenameFieldTest() { + streamingTestUtil.performDPLTest("index=* | accum _raw as new", numberDataWithMixedStringsTestFile, ds -> { + List newCol = ds + .select("new") + .collectAsList() + .stream() + .map(r -> r.getAs(0)) + .collect(Collectors.toList()); + // expect to skip strings in data and return empty + List expected = Arrays + .asList( + "10", streamingTestUtil.getCtx().nullValue.value(), "110", + streamingTestUtil.getCtx().nullValue.value(), "165.0" + ); + assertTrue(newCol.containsAll(expected)); + }); + } } - - diff --git a/src/test/java/com/teragrep/pth10/AddtotalsTransformationTest.java b/src/test/java/com/teragrep/pth10/AddtotalsTransformationTest.java index 80b9407..82a8ea0 100644 --- a/src/test/java/com/teragrep/pth10/AddtotalsTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/AddtotalsTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -62,20 +62,19 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class AddtotalsTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(AddtotalsTransformationTest.class); private final String testFile = "src/test/resources/numberData_0*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) 
+ }); private StreamingTestUtil streamingTestUtil; @@ -95,12 +94,20 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void addtotals_noparams_test() { streamingTestUtil.performDPLTest("index=* | addtotals ", testFile, ds -> { - List res = ds.select("Total").collectAsList().stream().map(r->r.getAs(0).toString()).sorted().collect(Collectors.toList()); + List res = ds + .select("Total") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .sorted() + .collect(Collectors.toList()); List expected = Arrays.asList("36.0", "11.0", "1.0", "-9.0", "48.2"); assertEquals(5, res.size()); assertEquals(5, expected.size()); @@ -114,11 +121,20 @@ void addtotals_noparams_test() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void addtotals_colParam_test() { streamingTestUtil.performDPLTest("index=* | addtotals col=true", testFile, ds -> { - List res = ds.select("_raw").collectAsList().stream().map(r->Double.parseDouble(r.getAs(0).toString())).sorted(Double::compareTo).collect(Collectors.toList()); - List expected = Arrays.asList(-10d,0d,10d,35d,47.2d,82.2d); + List res = ds + .select("_raw") + .collectAsList() + .stream() + .map(r -> Double.parseDouble(r.getAs(0).toString())) + .sorted(Double::compareTo) + .collect(Collectors.toList()); + List expected = Arrays.asList(-10d, 0d, 10d, 35d, 47.2d, 82.2d); assertEquals(6, res.size()); assertEquals(6, expected.size()); @@ -131,21 +147,21 @@ void addtotals_colParam_test() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void addtotals_fieldNames_test() { - streamingTestUtil.performDPLTest( - "index=* | addtotals col=true row=true labelfield=x1 fieldname=x2", - testFile, - ds -> { - List fieldsInData = Arrays.asList(ds.schema().fieldNames()); - // source schema + labelfield and fieldname - assertEquals(testSchema.length() + 2, fieldsInData.size()); - // check that fieldname and labelfield are present in schema - assertTrue(fieldsInData.contains("x1")); - assertTrue(fieldsInData.contains("x2")); - // 5 source rows plus last row for column sums - assertEquals(6, ds.count()); - } - ); + streamingTestUtil + .performDPLTest("index=* | addtotals col=true row=true labelfield=x1 fieldname=x2", testFile, ds -> { + List fieldsInData = Arrays.asList(ds.schema().fieldNames()); + // source schema + labelfield and fieldname + assertEquals(testSchema.length() + 2, fieldsInData.size()); + // check that fieldname and labelfield are present in schema + assertTrue(fieldsInData.contains("x1")); + assertTrue(fieldsInData.contains("x2")); + // 5 source rows plus last row for column sums + assertEquals(6, ds.count()); + }); } } diff --git a/src/test/java/com/teragrep/pth10/AggregateAfterSequentialCommandTest.java b/src/test/java/com/teragrep/pth10/AggregateAfterSequentialCommandTest.java index 332f692..5fa8529 100644 --- a/src/test/java/com/teragrep/pth10/AggregateAfterSequentialCommandTest.java +++ b/src/test/java/com/teragrep/pth10/AggregateAfterSequentialCommandTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 
2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -59,22 +59,21 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class AggregateAfterSequentialCommandTest { + private static final Logger LOGGER = LoggerFactory.getLogger(AggregateAfterSequentialCommandTest.class); private final String testFile = "src/test/resources/rexTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -99,60 +98,73 @@ void tearDown() { // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void aggregateAfterDedupTest() { - streamingTestUtil.performDPLTest( - "index=index_A | spath path=rainfall_rate | dedup rainfall_rate | stats sum(rainfall_rate)", - testFile, - ds -> { - assertEquals("139.875", ds.select("sum(rainfall_rate)").first().getString(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | spath path=rainfall_rate | dedup rainfall_rate | stats sum(rainfall_rate)", + testFile, ds -> { + assertEquals("139.875", ds.select("sum(rainfall_rate)").first().getString(0)); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = 
"true" + ) public void aggregateBeforeSeqModeAndAfter() { - streamingTestUtil.performDPLTest( - "index=index_A | spath path=rainfall_rate | stats count(rainfall_rate) as cr by _raw | dedup cr | stats sum(cr)", - testFile, - ds -> { - assertEquals("5", ds.select("sum(cr)").first().getString(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | spath path=rainfall_rate | stats count(rainfall_rate) as cr by _raw | dedup cr | stats sum(cr)", + testFile, ds -> { + assertEquals("5", ds.select("sum(cr)").first().getString(0)); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void aggregateAfterHdfsLoadTest() { - streamingTestUtil.performDPLTest( - "index=index_A | spath | teragrep exec hdfs save /tmp/pth_10/aggregateAfterHdfsLoadTest overwrite=true", - testFile, - ds -> { - assertEquals(new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, true, new MetadataBuilder().build()), + streamingTestUtil + .performDPLTest( + "index=index_A | spath | teragrep exec hdfs save /tmp/pth_10/aggregateAfterHdfsLoadTest overwrite=true", + testFile, ds -> { + assertEquals(new StructType(new StructField[] { + new StructField( + "_time", + DataTypes.TimestampType, + true, + new MetadataBuilder().build() + ), new StructField("id", DataTypes.LongType, true, new MetadataBuilder().build()), new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), new StructField("index", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField( + "sourcetype", + DataTypes.StringType, + true, + new MetadataBuilder().build() + ), new StructField("host", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, true, new MetadataBuilder().build()), - new StructField("atmosphere_water_vapor_content", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("atmosphere_cloud_liquid_water_content", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("rainfall_rate", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("latitude", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("wind_speed", DataTypes.StringType, true, new MetadataBuilder().build()) + new StructField("source", DataTypes.StringType, true, new MetadataBuilder().build()), new StructField("partition", DataTypes.StringType, true, new MetadataBuilder().build()), new StructField("offset", DataTypes.LongType, true, new MetadataBuilder().build()), new StructField("atmosphere_water_vapor_content", DataTypes.StringType, true, new MetadataBuilder().build()), new StructField("atmosphere_cloud_liquid_water_content", DataTypes.StringType, true, new MetadataBuilder().build()), new StructField("rainfall_rate", DataTypes.StringType, true, new MetadataBuilder().build()), new StructField("latitude", DataTypes.StringType, true, new MetadataBuilder().build()), new StructField("wind_speed", DataTypes.StringType, true, new MetadataBuilder().build()) }), ds.schema()); - }); + } + ); this.streamingTestUtil.setUp(); // reset for 2nd query - 
streamingTestUtil.performDPLTest( - "| teragrep exec hdfs load /tmp/pth_10/aggregateAfterHdfsLoadTest | dedup rainfall_rate | stats sum(rainfall_rate)", - testFile, - ds -> { - assertEquals("139.875", ds.select("sum(rainfall_rate)").first().getString(0)); - }); + streamingTestUtil + .performDPLTest( + "| teragrep exec hdfs load /tmp/pth_10/aggregateAfterHdfsLoadTest | dedup rainfall_rate | stats sum(rainfall_rate)", + testFile, ds -> { + assertEquals("139.875", ds.select("sum(rainfall_rate)").first().getString(0)); + } + ); } } - - diff --git a/src/test/java/com/teragrep/pth10/BloomFilterOperationsTest.java b/src/test/java/com/teragrep/pth10/BloomFilterOperationsTest.java index 75a078a..585f02e 100644 --- a/src/test/java/com/teragrep/pth10/BloomFilterOperationsTest.java +++ b/src/test/java/com/teragrep/pth10/BloomFilterOperationsTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -63,73 +63,77 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class BloomFilterOperationsTest { - private static final Logger LOGGER = LoggerFactory.getLogger(BloomFilterOperationsTest.class); - private final String testFile = "src/test/resources/xmlWalkerTestDataStreaming/bloomTeragrepStep_data*.json"; - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private static final Logger LOGGER = LoggerFactory.getLogger(BloomFilterOperationsTest.class); + private final String testFile = "src/test/resources/xmlWalkerTestDataStreaming/bloomTeragrepStep_data*.json"; - private StreamingTestUtil streamingTestUtil; + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new 
StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); - @org.junit.jupiter.api.BeforeAll - void setEnv() { - streamingTestUtil = new StreamingTestUtil(this.testSchema); - streamingTestUtil.setEnv(); - /* - Class.forName ("org.h2.Driver"); - this.conn = DriverManager.getConnection("jdbc:h2:~/test;MODE=MariaDB;DATABASE_TO_LOWER=TRUE;CASE_INSENSITIVE_IDENTIFIERS=TRUE", "sa", ""); - org.h2.tools.RunScript.execute(conn, new FileReader("src/test/resources/bloomdb/bloomdb.sql")); - */ - } + private StreamingTestUtil streamingTestUtil; - @org.junit.jupiter.api.BeforeEach - void setUp() { - streamingTestUtil.setUp(); - /* - conn.prepareStatement("TRUNCATE TABLE filter_expected_100000_fpp_001").execute(); - conn.prepareStatement("TRUNCATE TABLE filter_expected_1000000_fpp_003").execute(); - conn.prepareStatement("TRUNCATE TABLE filter_expected_2500000_fpp_005").execute(); - */ - } + @org.junit.jupiter.api.BeforeAll + void setEnv() { + streamingTestUtil = new StreamingTestUtil(this.testSchema); + streamingTestUtil.setEnv(); + /* + Class.forName ("org.h2.Driver"); + this.conn = DriverManager.getConnection("jdbc:h2:~/test;MODE=MariaDB;DATABASE_TO_LOWER=TRUE;CASE_INSENSITIVE_IDENTIFIERS=TRUE", "sa", ""); + org.h2.tools.RunScript.execute(conn, new FileReader("src/test/resources/bloomdb/bloomdb.sql")); + */ + } - @org.junit.jupiter.api.AfterEach - void tearDown() { - streamingTestUtil.tearDown(); - } + @org.junit.jupiter.api.BeforeEach + void setUp() { + streamingTestUtil.setUp(); + /* + conn.prepareStatement("TRUNCATE TABLE filter_expected_100000_fpp_001").execute(); + conn.prepareStatement("TRUNCATE TABLE filter_expected_1000000_fpp_003").execute(); + conn.prepareStatement("TRUNCATE TABLE filter_expected_2500000_fpp_005").execute(); + */ + } - // ---------------------------------------- - // Tests - // ---------------------------------------- + @org.junit.jupiter.api.AfterEach + void tearDown() { + streamingTestUtil.tearDown(); + } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void estimateTest() { - streamingTestUtil.performDPLTest( - "index=index_A earliest=2020-01-01T00:00:00z latest=2023-01-01T00:00:00z | teragrep exec tokenizer | teragrep exec bloom estimate", - testFile, - ds -> { - assertEquals("[partition, estimate(tokens)]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - List results = ds.select("estimate(tokens)") - .collectAsList().stream() - .map(r -> Integer.parseInt(r.get(0).toString())) - .collect(Collectors.toList()); + // ---------------------------------------- + // Tests + // ---------------------------------------- - assertEquals(results.get(0), 1); - assertTrue(results.get(1) > 1); - } - ); - } -} + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void estimateTest() { + streamingTestUtil + .performDPLTest( + "index=index_A earliest=2020-01-01T00:00:00z latest=2023-01-01T00:00:00z | teragrep exec tokenizer | teragrep exec bloom estimate", + testFile, ds -> { + assertEquals( + "[partition, estimate(tokens)]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" 
+ ); + List results = ds + .select("estimate(tokens)") + .collectAsList() + .stream() + .map(r -> Integer.parseInt(r.get(0).toString())) + .collect(Collectors.toList()); + assertEquals(results.get(0), 1); + assertTrue(results.get(1) > 1); + } + ); + } +} diff --git a/src/test/java/com/teragrep/pth10/CatalystVisitorTest.java b/src/test/java/com/teragrep/pth10/CatalystVisitorTest.java index cebee16..3f80dd0 100644 --- a/src/test/java/com/teragrep/pth10/CatalystVisitorTest.java +++ b/src/test/java/com/teragrep/pth10/CatalystVisitorTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -71,6 +71,7 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class CatalystVisitorTest { + private static final Logger LOGGER = LoggerFactory.getLogger(CatalystVisitorTest.class); // Use this file for dataset initialization @@ -94,7 +95,10 @@ void tearDown() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void fromStringNot2Test() { String q = "index = \"cpu\" AND sourcetype = \"log:cpu:0\" NOT src"; @@ -130,7 +134,10 @@ void columnFromStringTest() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void columnFromStringDateTest() { // Add time ranges String q = "((( index =\"cpu\" AND host = \"sc-99-99-14-25\" ) AND sourcetype = \"log:cpu:0\" ) AND ( earliest= \"01/01/1970:02:00:00\" AND latest= \"01/01/2030:00:00:00\" ))"; @@ -138,7 +145,9 @@ void columnFromStringDateTest() { try { long earliestEpoch = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("01/01/1970:02:00:00"); long latestEpoch = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("01/01/2030:00:00:00"); - String e = "(((RLIKE(index, (?i)^cpu$) AND RLIKE(host, (?i)^sc-99-99-14-25)) AND RLIKE(sourcetype, (?i)^log:cpu:0)) AND ((_time >= from_unixtime(" + earliestEpoch + ", yyyy-MM-dd HH:mm:ss)) AND (_time < from_unixtime("+ latestEpoch + ", yyyy-MM-dd HH:mm:ss))))"; + String e = "(((RLIKE(index, (?i)^cpu$) AND RLIKE(host, (?i)^sc-99-99-14-25)) AND RLIKE(sourcetype, (?i)^log:cpu:0)) AND ((_time >= from_unixtime(" + + earliestEpoch + ", yyyy-MM-dd HH:mm:ss)) AND (_time < from_unixtime(" + latestEpoch + + ", yyyy-MM-dd HH:mm:ss))))"; DPLParserCatalystContext ctx = this.streamingTestUtil.getCtx(); String result = ctx.getSparkQuery(); @@ -146,14 +155,15 @@ void columnFromStringDateTest() { LOGGER.info("Expected=" + e); LOGGER.info("Result=" + result); assertEquals(e, result); - } catch (ParseException e) { + } + catch (ParseException e) { fail(e.getMessage()); } }); } @Disabled - // FIXME * to % + // FIXME * to % void columnFromStringDate1Test() { String q, e; Column result; @@ -165,24 +175,30 @@ void columnFromStringDate1Test() { DPLParserCatalystContext ctx = 
this.streamingTestUtil.getCtx(); - e = "((((NOT `index` LIKE 'strawberry') AND `sourcetype` LIKE 'example:strawberry:strawberry') AND `host` LIKE 'loadbalancer.example.com') OR ((((`index` LIKE '*' AND `host` LIKE 'firewall.example.com') AND (`_time` >= from_unixtime("+zero+", 'yyyy-MM-dd HH:mm:ss'))) AND (`_time` <= from_unixtime("+epoch+", 'yyyy-MM-dd HH:mm:ss'))) AND `_raw` RLIKE '(?i)^.*\\\\QDenied\\\\E.*'))"; - ctx.setEarliest("-1Y"); - DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); - CharStream inputStream = CharStreams.fromString(q); - DPLLexer lexer = new DPLLexer(inputStream); - DPLParser parser = new DPLParser(new CommonTokenStream(lexer)); - ParseTree tree = parser.root(); - LOGGER.debug(tree.toStringTree(parser)); - Object n = visitor.visit(tree); - result = visitor.getLogicalPartAsColumn(); + e = "((((NOT `index` LIKE 'strawberry') AND `sourcetype` LIKE 'example:strawberry:strawberry') AND `host` LIKE 'loadbalancer.example.com') OR ((((`index` LIKE '*' AND `host` LIKE 'firewall.example.com') AND (`_time` >= from_unixtime(" + + zero + ", 'yyyy-MM-dd HH:mm:ss'))) AND (`_time` <= from_unixtime(" + epoch + + ", 'yyyy-MM-dd HH:mm:ss'))) AND `_raw` RLIKE '(?i)^.*\\\\QDenied\\\\E.*'))"; + ctx.setEarliest("-1Y"); + DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); + CharStream inputStream = CharStreams.fromString(q); + DPLLexer lexer = new DPLLexer(inputStream); + DPLParser parser = new DPLParser(new CommonTokenStream(lexer)); + ParseTree tree = parser.root(); + LOGGER.debug(tree.toStringTree(parser)); + Object n = visitor.visit(tree); + result = visitor.getLogicalPartAsColumn(); assertEquals(e, result.expr().sql()); - } catch (ParseException exception) { + } + catch (ParseException exception) { fail(exception.getMessage()); } } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void columnFromStringAndTest() { //LOGGER.info("------ AND ---------"); String q = "index =\"strawberry\" AND sourcetype =\"example:strawberry:strawberry\""; @@ -196,7 +212,10 @@ void columnFromStringAndTest() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void columnFromStringOrTest() { //LOGGER.info("------ OR ---------"); String q = "index != \"strawberry\" OR sourcetype =\"example:strawberry:strawberry\""; @@ -221,8 +240,10 @@ void filterTest() { try { long earliest = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/26/2021:07:00:00"); long latest = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/26/2021:08:00:00"); -// e = "((((NOT `index` LIKE 'strawberry') AND `sourcetype` LIKE 'example:strawberry:strawberry') AND `host` LIKE 'loadbalancer.example.com') OR ((((`index` LIKE '*' AND `host` LIKE 'firewall.example.com') AND (`_time` >= from_unixtime("+earliest+", 'yyyy-MM-dd HH:mm:ss'))) AND (`_time` <= from_unixtime("+latest+", 'yyyy-MM-dd HH:mm:ss'))) AND `_raw` LIKE '%Denied%'))"; - e = "((((NOT `index` LIKE 'strawberry') AND `sourcetype` LIKE 'example:strawberry:strawberry') AND `host` LIKE 'loadbalancer.example.com') OR ((((`index` LIKE '*' AND `host` LIKE 'firewall.example.com') AND (`_time` >= from_unixtime("+earliest+", 'yyyy-MM-dd HH:mm:ss'))) AND (`_time` <= from_unixtime("+latest+", 'yyyy-MM-dd HH:mm:ss'))) AND `_raw` RLIKE '(?i)^.*\\\\QDenied\\\\E.*'))"; + // e = "((((NOT `index` LIKE 'strawberry') AND `sourcetype` LIKE 
'example:strawberry:strawberry') AND `host` LIKE 'loadbalancer.example.com') OR ((((`index` LIKE '*' AND `host` LIKE 'firewall.example.com') AND (`_time` >= from_unixtime("+earliest+", 'yyyy-MM-dd HH:mm:ss'))) AND (`_time` <= from_unixtime("+latest+", 'yyyy-MM-dd HH:mm:ss'))) AND `_raw` LIKE '%Denied%'))"; + e = "((((NOT `index` LIKE 'strawberry') AND `sourcetype` LIKE 'example:strawberry:strawberry') AND `host` LIKE 'loadbalancer.example.com') OR ((((`index` LIKE '*' AND `host` LIKE 'firewall.example.com') AND (`_time` >= from_unixtime(" + + earliest + ", 'yyyy-MM-dd HH:mm:ss'))) AND (`_time` <= from_unixtime(" + latest + + ", 'yyyy-MM-dd HH:mm:ss'))) AND `_raw` RLIKE '(?i)^.*\\\\QDenied\\\\E.*'))"; DPLParserCatalystContext ctx = this.streamingTestUtil.getCtx(); ctx.setEarliest("-1Y"); @@ -234,20 +255,25 @@ void filterTest() { LOGGER.debug(tree.toStringTree(parser)); CatalystNode n = (CatalystNode) visitor.visit(tree); result = visitor.getLogicalPartAsColumn(); - assertEquals(e, result.expr().sql());} catch (ParseException exception) { + assertEquals(e, result.expr().sql()); + } + catch (ParseException exception) { fail(exception.getMessage()); } } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void fromStringFullTest() { String q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as count by _time | where count > 70"; this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { DPLTimeFormat tf = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss"); long earliest = Assertions.assertDoesNotThrow(() -> tf.getEpoch("04/16/2020:10:25:40")); - String e = "(RLIKE(index, (?i)^cinnamon$) AND (_time >= from_unixtime(" + earliest + ", yyyy-MM-dd HH:mm:ss)))"; + String e = "(RLIKE(index, (?i)^cinnamon$) AND (_time >= from_unixtime(" + earliest + + ", yyyy-MM-dd HH:mm:ss)))"; DPLParserCatalystContext ctx = this.streamingTestUtil.getCtx(); String result = ctx.getSparkQuery(); @@ -256,50 +282,60 @@ void fromStringFullTest() { }); } - // index = cinnamon _index_earliest="04/16/2020:10:25:40" @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void endToEndTest() { - this.streamingTestUtil.performDPLTest("index = cinnamon _index_earliest=\"04/16/2020:10:25:40\"", this.testFile, res -> { - String e = "[_raw: string, _time: string ... 6 more fields]"; - // check schema - assertEquals(e, res.toString()); - - String logicalPart = this.streamingTestUtil.getCtx().getSparkQuery(); - // check column for archive query i.e. only logical part' - DPLTimeFormat tf = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss"); - long indexEarliestEpoch = Assertions.assertDoesNotThrow(() -> tf.getEpoch("04/16/2020:10:25:40")); - e = "(RLIKE(index, (?i)^cinnamon$) AND (_time >= from_unixtime(" + indexEarliestEpoch + ", yyyy-MM-dd HH:mm:ss)))"; - assertEquals(e, logicalPart); - }); + this.streamingTestUtil + .performDPLTest("index = cinnamon _index_earliest=\"04/16/2020:10:25:40\"", this.testFile, res -> { + String e = "[_raw: string, _time: string ... 6 more fields]"; + // check schema + assertEquals(e, res.toString()); + + String logicalPart = this.streamingTestUtil.getCtx().getSparkQuery(); + // check column for archive query i.e. 
only logical part' + DPLTimeFormat tf = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss"); + long indexEarliestEpoch = Assertions.assertDoesNotThrow(() -> tf.getEpoch("04/16/2020:10:25:40")); + e = "(RLIKE(index, (?i)^cinnamon$) AND (_time >= from_unixtime(" + indexEarliestEpoch + + ", yyyy-MM-dd HH:mm:ss)))"; + assertEquals(e, logicalPart); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void endToEnd2Test() { // Use this file as the test data String testFile = "src/test/resources/subsearchData*.json"; this.streamingTestUtil.performDPLTest("index=index_A \"(1)(enTIty)\"", testFile, res -> { String e = "StructType(StructField(_raw,StringType,true),StructField(_time,StringType,true),StructField(host,StringType,true),StructField(index,StringType,true),StructField(offset,LongType,true),StructField(origin,StringType,true),StructField(partition,StringType,true),StructField(source,StringType,true),StructField(sourcetype,StringType,true))"; - String resSchema=res.schema().toString(); + String resSchema = res.schema().toString(); assertEquals(e, resSchema); // Check result count List lst = res.collectAsList(); // check result count - assertEquals(1,lst.size()); + assertEquals(1, lst.size()); // get logical part String logicalPart = this.streamingTestUtil.getCtx().getSparkQuery(); - e="(RLIKE(index, (?i)^index_A$) AND RLIKE(_raw, (?i)^.*\\Q(1)(enTIty)\\E.*))"; + e = "(RLIKE(index, (?i)^index_A$) AND RLIKE(_raw, (?i)^.*\\Q(1)(enTIty)\\E.*))"; assertEquals(e, logicalPart); }); } // Check that is AggregatesUsed returns false @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void endToEnd6Test() { this.streamingTestUtil.performDPLTest("index = jla02logger ", this.testFile, res -> { boolean aggregates = this.streamingTestUtil.getCatalystVisitor().getAggregatesUsed(); @@ -307,19 +343,26 @@ void endToEnd6Test() { }); } - // Check that issue#179 returns user friendly error message @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void searchQualifierMissingRightSide_Issue179_Test() { // assert user-friendly exception - RuntimeException thrown = - this.streamingTestUtil.performThrowingDPLTest(RuntimeException.class, "index = ", this.testFile, res -> {}); - - Assertions.assertEquals("The right side of the search qualifier was empty! Check that the index has a valid value, like 'index = cinnamon'.", thrown.getMessage()); + RuntimeException thrown = this.streamingTestUtil + .performThrowingDPLTest(RuntimeException.class, "index = ", this.testFile, res -> { + }); + + Assertions + .assertEquals( + "The right side of the search qualifier was empty! 
Check that the index has a valid value, like 'index = cinnamon'.", + thrown.getMessage() + ); } - @Disabled(value="Disabled because the test needs to be converted to a dataframe test") + @Disabled(value = "Disabled because the test needs to be converted to a dataframe test") @Test // disabled on 2022-05-16 TODO convert to dataframe test public void parseWhereXmlTest() { String q, e; @@ -328,33 +371,38 @@ public void parseWhereXmlTest() { DPLTimeFormat tf = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss"); long earliest = Assertions.assertDoesNotThrow(() -> tf.getEpoch("04/16/2020:10:25:40")); - e = ""; + e = ""; String xml = Assertions.assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); assertEquals(e, xml); } - @Disabled(value="Disabled because the test needs to be converted to a dataframe test") + @Disabled(value = "Disabled because the test needs to be converted to a dataframe test") @Test // disabled on 2022-05-16 TODO convert to dataframe test public void parseWhereXml1Test() { String q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as count by _time | where count > 70 AND count < 75"; DPLTimeFormat tf = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss"); long earliest = Assertions.assertDoesNotThrow(() -> tf.getEpoch("04/16/2020:10:25:40")); - String e = ""; + String e = ""; String xml = Assertions.assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); assertEquals(e, xml); } - @Disabled(value="Disabled because the test needs to be converted to a dataframe test") + @Disabled(value = "Disabled because the test needs to be converted to a dataframe test") @Test // disabled on 2022-05-16 TODO convert to dataframe test public void parseWhereXml2Test() { String q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as count by _time | where count > 70 AND count < 75 | where count = 72"; DPLTimeFormat tf = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss"); long earliest = Assertions.assertDoesNotThrow(() -> tf.getEpoch("04/16/2020:10:25:40")); - String e = ""; + String e = ""; String xml = Assertions.assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); assertEquals(e, xml); } } - diff --git a/src/test/java/com/teragrep/pth10/ConvertTransformationTest.java b/src/test/java/com/teragrep/pth10/ConvertTransformationTest.java index e82ad9f..b6b56ac 100644 --- a/src/test/java/com/teragrep/pth10/ConvertTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/ConvertTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -63,487 +63,670 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class ConvertTransformationTest { - private static final Logger LOGGER = LoggerFactory.getLogger(ConvertTransformationTest.class); - - // Use this file for dataset initialization - String testFile = "src/test/resources/convertTfData*.json"; // * to make the path into a directory path - - private StreamingTestUtil streamingTestUtil; - - @org.junit.jupiter.api.BeforeAll - void setEnv() { - this.streamingTestUtil = new StreamingTestUtil(); - this.streamingTestUtil.setEnv(); - } - - @org.junit.jupiter.api.BeforeEach - void setUp() { - this.streamingTestUtil.setUp(); - } - - @org.junit.jupiter.api.AfterEach - void tearDown() { - this.streamingTestUtil.tearDown(); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert1_ctime() { - // "%m/%d/%Y %H:%M:%S"; - streamingTestUtil.performDPLTest( - "index=index_A | convert ctime(offset) AS new", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, new]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("new").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - for (String s : listOfResults) { - // match 00/00/0000 00:00:00 - Matcher m = Pattern.compile("\\d{2}/\\d{2}/\\d{4} \\d{2}:\\d{2}:\\d{2}").matcher(s); - - assertTrue(m.find()); - } - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert2_ctime() { - streamingTestUtil.performDPLTest( - "index=index_A | convert ctime(offset)", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expectedResults = Arrays.asList( - "01/01/1970 00:00:11", - "01/01/1970 00:00:11", - "01/01/1970 00:00:10", - "01/01/1970 00:00:09", - "01/01/1970 00:00:08", - "01/01/1970 00:00:07", - "01/01/1970 00:00:06", - "01/01/1970 00:00:05", - "01/01/1970 00:00:04", - "01/01/1970 00:00:03", - "01/01/1970 00:00:02", - "01/01/1970 00:00:01"); - - for (int i = 0; i < listOfResults.size(); i++) { - assertEquals(expectedResults.get(i), listOfResults.get(i)); - } - } - ); + + private static final Logger LOGGER = LoggerFactory.getLogger(ConvertTransformationTest.class); + + // Use this file for dataset initialization + String testFile = "src/test/resources/convertTfData*.json"; // * to make the path into a directory path + + private StreamingTestUtil streamingTestUtil; + + @org.junit.jupiter.api.BeforeAll + void setEnv() { + this.streamingTestUtil = new StreamingTestUtil(); + this.streamingTestUtil.setEnv(); + } + + @org.junit.jupiter.api.BeforeEach + void setUp() { + this.streamingTestUtil.setUp(); + } + + @org.junit.jupiter.api.AfterEach + void tearDown() { + this.streamingTestUtil.tearDown(); } @Test - @DisabledIfSystemProperty(named="runSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert1_ctime() { + // "%m/%d/%Y %H:%M:%S"; + streamingTestUtil.performDPLTest("index=index_A | convert ctime(offset) AS new", testFile, ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, new]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + 
.select("new") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + for (String s : listOfResults) { + // match 00/00/0000 00:00:00 + Matcher m = Pattern.compile("\\d{2}/\\d{2}/\\d{4} \\d{2}:\\d{2}:\\d{2}").matcher(s); + + assertTrue(m.find()); + } + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert2_ctime() { + streamingTestUtil.performDPLTest("index=index_A | convert ctime(offset)", testFile, ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + .select("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expectedResults = Arrays + .asList( + "01/01/1970 00:00:11", "01/01/1970 00:00:11", "01/01/1970 00:00:10", "01/01/1970 00:00:09", + "01/01/1970 00:00:08", "01/01/1970 00:00:07", "01/01/1970 00:00:06", "01/01/1970 00:00:05", + "01/01/1970 00:00:04", "01/01/1970 00:00:03", "01/01/1970 00:00:02", "01/01/1970 00:00:01" + ); + + for (int i = 0; i < listOfResults.size(); i++) { + assertEquals(expectedResults.get(i), listOfResults.get(i)); + } + }); + } + + @Test + @DisabledIfSystemProperty( + named = "runSparkTest", + matches = "true" + ) void convert3_mktime() { - streamingTestUtil.performDPLTest( - "index=index_A | convert timeformat=\"%Y-%m-%d'T'%H:%M:%S.%f%z\" mktime(_time) as epochTime", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, epochTime]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("epochTime").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - // rows get sorted by timestamp, so the order differs from original test data - List expectedResults = Arrays.asList( - "1286698810", - "1286698210", - "1286694610", - "1252476549", - "1218172088", - "1183781227", - "1149563166", - "1115258705", - "1081040644", - // Below epochs are winter months, but still +0300 (differs from local finnish time) - "1046649783", - "1012604522", - "978300061" - ); - - assertEquals(expectedResults, listOfResults); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | convert timeformat=\"%Y-%m-%d'T'%H:%M:%S.%f%z\" mktime(_time) as epochTime", + testFile, ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, epochTime]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + .select("epochTime") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + // rows get sorted by timestamp, so the order differs from original test data + List expectedResults = Arrays + .asList( + "1286698810", "1286698210", "1286694610", "1252476549", "1218172088", + "1183781227", "1149563166", "1115258705", "1081040644", + // Below epochs are winter months, but still +0300 (differs from local finnish time) + "1046649783", "1012604522", "978300061" + ); + + assertEquals(expectedResults, listOfResults); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert3_mktime2() { // Use the system default timezone when timezone is not specified + streamingTestUtil + .performDPLTest( + "index=index_A | eval a=\"2001-01-01T01:01:01.010\" | convert timeformat=\"%Y-%m-%d'T'%H:%M:%S.%f\" mktime(a) as epochTime", + testFile, ds -> { + assertEquals( + "[_raw, _time, dur, host, 
index, offset, partition, source, sourcetype, a, epochTime]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + .select("epochTime") + .distinct() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expectedResults = Collections + .singletonList( + "978303661" // +0300 timezone + ); + + assertEquals(expectedResults, listOfResults); + } + ); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert3_mktime2() { // Use the system default timezone when timezone is not specified - streamingTestUtil.performDPLTest( - "index=index_A | eval a=\"2001-01-01T01:01:01.010\" | convert timeformat=\"%Y-%m-%d'T'%H:%M:%S.%f\" mktime(a) as epochTime", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, a, epochTime]", Arrays.toString(ds.columns())); + @Test + @DisabledIfSystemProperty( + named = "runSparkTest", + matches = "true" + ) + void convert4_dur2sec() { + streamingTestUtil.performDPLTest("index=index_A | convert dur2sec(dur) as dur_sec", testFile, ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, dur_sec]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + .select("dur_sec") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + // rows get sorted by timestamp, so the order differs from original test data + List expectedResults = Arrays + .asList( + "5430", "0", "195792", "3600", "3661", "2400", "3723", "22", "7432", "1403", "24202", + "45296" + ); + + assertEquals(expectedResults, listOfResults); + }); - List listOfResults = ds.select("epochTime").distinct().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expectedResults = Collections.singletonList( - "978303661" // +0300 timezone - ); + } - assertEquals(expectedResults, listOfResults); - } - ); + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert5_memk() { + streamingTestUtil + .performDPLTest( + "index=index_A | strcat offset \"m\" offsetM | strcat offset \"k\" offsetK | strcat offset \"g\" offsetG | convert memk(offsetM) as memk_M memk(offsetK) as memk_K memk(offsetG) as memk_G memk(offset) as memk_def", + testFile, ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, offsetM, offsetK, offsetG, memk_M, memk_K, memk_G, memk_def]", + Arrays.toString(ds.columns()) + ); + + List resDef = ds + .select("memk_def") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List resK = ds + .select("memk_K") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List resM = ds + .select("memk_M") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List resG = ds + .select("memk_G") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + // rows get sorted by timestamp, so the order differs from original test data + List expDef = Arrays + .asList( + "11.0", "11.0", "10.0", "9.0", "8.0", "7.0", "6.0", "5.0", "4.0", "3.0", + "2.0", "1.0" + ); + List expM = Arrays + .asList( + "11264.0", "11264.0", "10240.0", "9216.0", "8192.0", "7168.0", "6144.0", + "5120.0", "4096.0", "3072.0", "2048.0", "1024.0" + ); + List expG = Arrays + .asList( + "1.1534336E7", "1.1534336E7", 
"1.048576E7", "9437184.0", "8388608.0", + "7340032.0", "6291456.0", "5242880.0", "4194304.0", "3145728.0", + "2097152.0", "1048576.0" + ); + + assertEquals(expDef, resDef); + assertEquals(expDef, resK); // def is same as K + assertEquals(expM, resM); + assertEquals(expG, resG); + } + ); + } + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert6_mstime() { + streamingTestUtil + .performDPLTest( + "index=index_A | strcat \"\" \"47.\" \"329\" mst | strcat \"32:\" \"47.\" \"329\" mst2 | convert mstime(mst) as res mstime(mst2) as res2", + testFile, ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, mst2, res, res2]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + .select("res") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List listOfResults2 = ds + .select("res2") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expectedResults = Collections.singletonList("47329"); + List expectedResults2 = Collections.singletonList("1967329"); + + assertEquals(expectedResults, listOfResults); + assertEquals(expectedResults2, listOfResults2); + } + ); + } - } + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert7_rmcomma() { + streamingTestUtil + .performDPLTest( + "index=index_A | strcat \"\" \"47,\" \"329\" mst | strcat \"32,\" \"47,\" \"329\" mst2 | convert rmcomma(mst) as res rmcomma(mst2) as res2", + testFile, ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, mst2, res, res2]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + .select("res") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List listOfResults2 = ds + .select("res2") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expectedResults = Collections.singletonList("47329"); + List expectedResults2 = Collections.singletonList("3247329"); + + assertEquals(expectedResults, listOfResults); + assertEquals(expectedResults2, listOfResults2); + } + ); + } @Test - @DisabledIfSystemProperty(named="runSparkTest", matches="true") - void convert4_dur2sec() { - streamingTestUtil.performDPLTest( - "index=index_A | convert dur2sec(dur) as dur_sec", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, dur_sec]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("dur_sec").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - // rows get sorted by timestamp, so the order differs from original test data - List expectedResults = Arrays.asList( - "5430", - "0", - "195792", - "3600", - "3661", - "2400", - "3723", - "22", - "7432", - "1403", - "24202", - "45296"); - - assertEquals(expectedResults, listOfResults); - } - ); - - - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert5_memk() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat offset \"m\" offsetM | strcat offset \"k\" offsetK | strcat offset \"g\" offsetG | convert memk(offsetM) as memk_M memk(offsetK) as memk_K memk(offsetG) as memk_G memk(offset) as memk_def", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, 
source, sourcetype, offsetM, offsetK, offsetG, memk_M, memk_K, memk_G, memk_def]", Arrays.toString(ds.columns())); - - List resDef = ds.select("memk_def").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List resK = ds.select("memk_K").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List resM = ds.select("memk_M").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List resG = ds.select("memk_G").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - // rows get sorted by timestamp, so the order differs from original test data - List expDef = Arrays.asList( - "11.0", - "11.0", - "10.0", - "9.0", - "8.0", - "7.0", - "6.0", - "5.0", - "4.0", - "3.0", - "2.0", - "1.0"); - List expM = Arrays.asList( - "11264.0", - "11264.0", - "10240.0", - "9216.0", - "8192.0", - "7168.0", - "6144.0", - "5120.0", - "4096.0", - "3072.0", - "2048.0", - "1024.0"); - List expG = Arrays.asList( - "1.1534336E7", - "1.1534336E7", - "1.048576E7", - "9437184.0", - "8388608.0", - "7340032.0", - "6291456.0", - "5242880.0", - "4194304.0", - "3145728.0", - "2097152.0", - "1048576.0"); - - assertEquals(expDef, resDef); - assertEquals(expDef, resK); // def is same as K - assertEquals(expM, resM); - assertEquals(expG, resG); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert6_mstime() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat \"\" \"47.\" \"329\" mst | strcat \"32:\" \"47.\" \"329\" mst2 | convert mstime(mst) as res mstime(mst2) as res2", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, mst2, res, res2]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("res").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List listOfResults2 = ds.select("res2").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expectedResults = Collections.singletonList( - "47329"); - List expectedResults2 = Collections.singletonList( - "1967329"); - - assertEquals(expectedResults, listOfResults); - assertEquals(expectedResults2, listOfResults2); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert7_rmcomma() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat \"\" \"47,\" \"329\" mst | strcat \"32,\" \"47,\" \"329\" mst2 | convert rmcomma(mst) as res rmcomma(mst2) as res2", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, mst2, res, res2]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("res").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List listOfResults2 = ds.select("res2").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expectedResults = Collections.singletonList( - "47329"); - List expectedResults2 = Collections.singletonList( - "3247329"); - - assertEquals(expectedResults, listOfResults); - assertEquals(expectedResults2, listOfResults2); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert8_rmunit() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat \"329\" \"abc\" as mst | convert rmunit(mst) as res", - testFile, - ds -> { - 
assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("res").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - List expectedResults = Collections.singletonList( - "329"); - - assertEquals(expectedResults, listOfResults); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert8_rmunit2() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat \"329.45\" \"abc\" as mst | convert rmunit(mst) as res", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("res").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - List expectedResults = Collections.singletonList( - "329.45"); - - assertEquals(expectedResults, listOfResults); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert8_rmunit3() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat \".54e2\" \"abc\" as mst | convert rmunit(mst) as res", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("res").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - List expectedResults = Collections.singletonList( - ".54E2"); - - assertEquals(expectedResults, listOfResults); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert8_rmunit4() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat \"-0.54e2\" \"abc\" as mst | convert rmunit(mst) as res", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("res").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expectedResults = Collections.singletonList("-0.54E2"); - - assertEquals(expectedResults, listOfResults); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert8_rmunit5() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat \"-0.21.54e2\" \"abc\" as mst | convert rmunit(mst) as res", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("res").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expectedResults = Collections.singletonList(""); - - assertEquals(expectedResults, listOfResults); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert8_rmunit6() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat \"+21.54e23\" \"abc\" as mst | convert rmunit(mst) as res", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("res").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expectedResults = Collections.singletonList("+21.54E23"); - - 
assertEquals(expectedResults, listOfResults); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert8_rmunit7() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat \"+21.54e-23\" \"abc\" as mst | convert rmunit(mst) as res", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("res").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - List expectedResults = Collections.singletonList("+21.54E-23"); - - assertEquals(expectedResults, listOfResults); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert8_rmunit8() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat \"+21.54e+23\" \"abc\" as mst | convert rmunit(mst) as res", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", Arrays.toString(ds.columns())); - - List listOfResults = ds.select("res").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expectedResults = Collections.singletonList("+21.54E+23"); - - assertEquals(expectedResults, listOfResults); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert9_auto() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat \"329\" \"\" with_results |strcat \"329\" \"aa\" no_results | convert auto(with_results) | convert auto(no_results)", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, with_results, no_results]", Arrays.toString(ds.columns())); - List listOfResults = ds.select("with_results").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List listOfResults2 = ds.select("no_results").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - List expectedResults = Collections.singletonList( - "329.0"); - - List expectedResults2 = Collections.singletonList( - "329aa"); - - assertEquals(expectedResults, listOfResults); - assertEquals(expectedResults2, listOfResults2); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert10_num() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat \"329\" \"\" with_results |strcat \"329\" \"aa\" no_results | convert num(with_results) | convert num(no_results)", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, with_results, no_results]", Arrays.toString(ds.columns())); - List listOfResults = ds.select("with_results").limit(1).collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List listOfResults2 = ds.select("no_results").limit(1).collectAsList().stream().map(r -> r.getAs(0) == null ? 
"null" : r.getAs(0).toString()).collect(Collectors.toList()); - - List expectedResults = Collections.singletonList("329.0"); - - List expectedResults2 = Collections.singletonList("null"); - - assertEquals(expectedResults, listOfResults); - assertEquals(expectedResults2, listOfResults2); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void convert11_none() { - streamingTestUtil.performDPLTest( - "index=index_A | convert dur2sec(\"dur|offset\") AS dur_sec none(offset)", - testFile, - ds -> { - assertEquals("[_raw, _time, dur, host, index, offset, partition, source, sourcetype, dur_sec]", Arrays.toString(ds.columns())); - List listOfResults = ds.select("dur_sec").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - // rows get sorted by timestamp, so the order differs from original test data - List expectedResults = Arrays.asList( - "5430", - "0", - "195792", - "3600", - "3661", - "2400", - "3723", - "22", - "7432", - "1403", - "24202", - "45296"); - - assertEquals(expectedResults, listOfResults); - } - ); - } + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert8_rmunit() { + streamingTestUtil + .performDPLTest( + "index=index_A | strcat \"329\" \"abc\" as mst | convert rmunit(mst) as res", testFile, ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + .select("res") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + List expectedResults = Collections.singletonList("329"); + + assertEquals(expectedResults, listOfResults); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert8_rmunit2() { + streamingTestUtil + .performDPLTest( + "index=index_A | strcat \"329.45\" \"abc\" as mst | convert rmunit(mst) as res", testFile, + ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + .select("res") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + List expectedResults = Collections.singletonList("329.45"); + + assertEquals(expectedResults, listOfResults); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert8_rmunit3() { + streamingTestUtil + .performDPLTest( + "index=index_A | strcat \".54e2\" \"abc\" as mst | convert rmunit(mst) as res", testFile, + ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + .select("res") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + List expectedResults = Collections.singletonList(".54E2"); + + assertEquals(expectedResults, listOfResults); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert8_rmunit4() { + streamingTestUtil + .performDPLTest( + "index=index_A | strcat \"-0.54e2\" \"abc\" as mst | convert rmunit(mst) as res", testFile, + ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + 
.select("res") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expectedResults = Collections.singletonList("-0.54E2"); + + assertEquals(expectedResults, listOfResults); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert8_rmunit5() { + streamingTestUtil + .performDPLTest( + "index=index_A | strcat \"-0.21.54e2\" \"abc\" as mst | convert rmunit(mst) as res", testFile, + ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + .select("res") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expectedResults = Collections.singletonList(""); + + assertEquals(expectedResults, listOfResults); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert8_rmunit6() { + streamingTestUtil + .performDPLTest( + "index=index_A | strcat \"+21.54e23\" \"abc\" as mst | convert rmunit(mst) as res", testFile, + ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + .select("res") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expectedResults = Collections.singletonList("+21.54E23"); + + assertEquals(expectedResults, listOfResults); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert8_rmunit7() { + streamingTestUtil + .performDPLTest( + "index=index_A | strcat \"+21.54e-23\" \"abc\" as mst | convert rmunit(mst) as res", testFile, + ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + .select("res") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + List expectedResults = Collections.singletonList("+21.54E-23"); + + assertEquals(expectedResults, listOfResults); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert8_rmunit8() { + streamingTestUtil + .performDPLTest( + "index=index_A | strcat \"+21.54e+23\" \"abc\" as mst | convert rmunit(mst) as res", testFile, + ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, mst, res]", + Arrays.toString(ds.columns()) + ); + + List listOfResults = ds + .select("res") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expectedResults = Collections.singletonList("+21.54E+23"); + + assertEquals(expectedResults, listOfResults); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert9_auto() { + streamingTestUtil + .performDPLTest( + "index=index_A | strcat \"329\" \"\" with_results |strcat \"329\" \"aa\" no_results | convert auto(with_results) | convert auto(no_results)", + testFile, ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, with_results, no_results]", + Arrays.toString(ds.columns()) + ); + List listOfResults = ds + .select("with_results") + .limit(1) + .collectAsList() + 
.stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List listOfResults2 = ds + .select("no_results") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + List expectedResults = Collections.singletonList("329.0"); + + List expectedResults2 = Collections.singletonList("329aa"); + + assertEquals(expectedResults, listOfResults); + assertEquals(expectedResults2, listOfResults2); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert10_num() { + streamingTestUtil + .performDPLTest( + "index=index_A | strcat \"329\" \"\" with_results |strcat \"329\" \"aa\" no_results | convert num(with_results) | convert num(no_results)", + testFile, ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, with_results, no_results]", + Arrays.toString(ds.columns()) + ); + List listOfResults = ds + .select("with_results") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List listOfResults2 = ds + .select("no_results") + .limit(1) + .collectAsList() + .stream() + .map(r -> r.getAs(0) == null ? "null" : r.getAs(0).toString()) + .collect(Collectors.toList()); + + List expectedResults = Collections.singletonList("329.0"); + + List expectedResults2 = Collections.singletonList("null"); + + assertEquals(expectedResults, listOfResults); + assertEquals(expectedResults2, listOfResults2); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void convert11_none() { + streamingTestUtil + .performDPLTest( + "index=index_A | convert dur2sec(\"dur|offset\") AS dur_sec none(offset)", testFile, ds -> { + assertEquals( + "[_raw, _time, dur, host, index, offset, partition, source, sourcetype, dur_sec]", + Arrays.toString(ds.columns()) + ); + List listOfResults = ds + .select("dur_sec") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + // rows get sorted by timestamp, so the order differs from original test data + List expectedResults = Arrays + .asList( + "5430", "0", "195792", "3600", "3661", "2400", "3723", "22", "7432", "1403", + "24202", "45296" + ); + + assertEquals(expectedResults, listOfResults); + } + ); + } } diff --git a/src/test/java/com/teragrep/pth10/DPLTimeFormatTest.java b/src/test/java/com/teragrep/pth10/DPLTimeFormatTest.java index d9b4f97..37514b9 100644 --- a/src/test/java/com/teragrep/pth10/DPLTimeFormatTest.java +++ b/src/test/java/com/teragrep/pth10/DPLTimeFormatTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10; import com.teragrep.pth10.ast.DPLTimeFormat; diff --git a/src/test/java/com/teragrep/pth10/DedupTransformationTest.java b/src/test/java/com/teragrep/pth10/DedupTransformationTest.java index f19d4b1..592a232 100644 --- a/src/test/java/com/teragrep/pth10/DedupTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/DedupTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -57,13 +57,13 @@ import static org.junit.jupiter.api.Assertions.*; /** - * Tests for the new ProcessingStack implementation - * Uses streaming datasets + * Tests for the new ProcessingStack implementation Uses streaming datasets + * * @author eemhu - * */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class DedupTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(DedupTransformationTest.class); // Use this file for dataset initialization @@ -87,13 +87,15 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // basic dedup + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // basic dedup public void dedupTest_NoParams() { this.streamingTestUtil.performDPLTest("index=index_A | dedup _raw", this.testFile, res -> { List expectedColumns = new ArrayList<>( @@ -115,7 +117,10 @@ public void dedupTest_NoParams() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // consecutive=true + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // consecutive=true public void dedupTest_Consecutive() { String query = "index=index_A | dedup _raw consecutive= true"; this.streamingTestUtil.performDPLTest(query, this.testFile, res -> { @@ -137,7 +142,10 @@ public void dedupTest_Consecutive() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // sort descending as numbers + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // sort descending as numbers public void dedupTest_SortNum() { String query = "index=index_A | dedup _raw sortby - num(_raw)"; this.streamingTestUtil.performDPLTest(query, this.testFile, res -> { @@ -156,7 +164,10 @@ public void dedupTest_SortNum() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // keep duplicate events with nulls + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // keep duplicate events with nulls public void dedupTest_KeepEvents() { String query = "index=index_A | dedup _raw keepevents= true"; this.streamingTestUtil.performDPLTest(query, this.testFile, res -> { @@ -179,7 +190,10 @@ public void dedupTest_KeepEvents() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // 
keep null values + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // keep null values public void dedupTest_KeepEmpty() { // first use keepevents=true to make null values in the dataset String query = "index=index_A | dedup _raw keepevents= true | dedup _raw keepempty= true"; @@ -201,7 +215,10 @@ public void dedupTest_KeepEmpty() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // deduplicate based on _raw, sourcetype and partition + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // deduplicate based on _raw, sourcetype and partition public void dedupTest_MultiColumn() { String query = "index=index_A | dedup _raw, sourcetype, partition"; this.streamingTestUtil.performDPLTest(query, this.testFile, res -> { diff --git a/src/test/java/com/teragrep/pth10/DefaultTimeFormatTest.java b/src/test/java/com/teragrep/pth10/DefaultTimeFormatTest.java index 12ff148..fa56c1d 100644 --- a/src/test/java/com/teragrep/pth10/DefaultTimeFormatTest.java +++ b/src/test/java/com/teragrep/pth10/DefaultTimeFormatTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10; import com.teragrep.pth10.ast.DefaultTimeFormat; diff --git a/src/test/java/com/teragrep/pth10/DynatraceTestAPICallback.java b/src/test/java/com/teragrep/pth10/DynatraceTestAPICallback.java index f8a47ee..cdf48a3 100644 --- a/src/test/java/com/teragrep/pth10/DynatraceTestAPICallback.java +++ b/src/test/java/com/teragrep/pth10/DynatraceTestAPICallback.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -54,9 +54,11 @@ import java.util.regex.Pattern; public class DynatraceTestAPICallback implements ExpectationResponseCallback { + public DynatraceTestAPICallback() { } + @Override public HttpResponse handle(HttpRequest httpRequest) { final String reqString = httpRequest.getBodyAsString(); @@ -64,14 +66,18 @@ public HttpResponse handle(HttpRequest httpRequest) { int invalidCount = 0; int statusCode = 500; //500=server error, 202=all lines ok, 400=some lines may be ok - Pattern p = Pattern.compile(".*(\\.[^()]*)*\\sgauge," + - "min=\\d+(\\.\\d+)?,max=\\d+(\\.\\d+)?,sum=\\d+(\\.\\d+)?,count=\\d+\\s\\d+\n" + - "#.*\\sgauge\\sdt\\.meta\\.displayName=.*,\\sdt\\.meta\\.description=.*,\\sdt\\.meta\\.unit=.*"); + Pattern p = Pattern + .compile( + ".*(\\.[^()]*)*\\sgauge," + + "min=\\d+(\\.\\d+)?,max=\\d+(\\.\\d+)?,sum=\\d+(\\.\\d+)?,count=\\d+\\s\\d+\n" + + "#.*\\sgauge\\sdt\\.meta\\.displayName=.*,\\sdt\\.meta\\.description=.*,\\sdt\\.meta\\.unit=.*" + ); Matcher m = p.matcher(reqString); if (m.matches()) { validCount++; - } else { + } + else { invalidCount++; } @@ -79,8 +85,11 @@ public HttpResponse handle(HttpRequest httpRequest) { statusCode = 202; - return HttpResponse.response("{\"error\": null, " + - "\"linesValid\": " + validCount +", " + - "\"linesInvalid\": " + invalidCount + "}").withStatusCode(statusCode); + return HttpResponse + .response( + "{\"error\": null, " + "\"linesValid\": " + validCount + ", " + "\"linesInvalid\": " + + invalidCount + "}" + ) + .withStatusCode(statusCode); } } diff --git a/src/test/java/com/teragrep/pth10/EarliestLatestTest.java b/src/test/java/com/teragrep/pth10/EarliestLatestTest.java index 8c1336e..bd02daf 100644 --- a/src/test/java/com/teragrep/pth10/EarliestLatestTest.java +++ b/src/test/java/com/teragrep/pth10/EarliestLatestTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -50,7 +50,6 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.functions; -import org.apache.spark.sql.streaming.StreamingQueryException; import org.apache.spark.sql.types.DataTypes; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -72,14 +71,13 @@ import static org.junit.jupiter.api.Assertions.*; /** - * Tests time-related aspects of the project, - * such as TimeStatement. - * Uses streaming datasets + * Tests time-related aspects of the project, such as TimeStatement. 
Uses streaming datasets * * @author eemhu */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class EarliestLatestTest { + private static final Logger LOGGER = LoggerFactory.getLogger(EarliestLatestTest.class); // Use this file for dataset initialization @@ -103,148 +101,235 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- // FIXME: earliest-latest defaulting removed and default tests set to Disabled to fix issue #351 - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void earliestLatestTest1() { String query = "index=strawberry earliest=-10y OR index=seagull"; - this.streamingTestUtil.performDPLTest(query, this.testFile, setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), - res -> { - List indexAsList = res.select("index").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + this.streamingTestUtil + .performDPLTest(query, this.testFile, setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { + List indexAsList = res + .select("index") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertTrue(indexAsList.contains("strawberry")); assertFalse(indexAsList.contains("seagull")); }); } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void earliestLatestTest2( ) { + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void earliestLatestTest2() { String query = "index=strawberry OR index=seagull | stats count(_raw) by index"; this.streamingTestUtil.performDPLTest(query, this.testFile, res -> { - List indexAsList = res.select("index").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List indexAsList = res + .select("index") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertTrue(indexAsList.contains("strawberry")); assertTrue(indexAsList.contains("seagull")); }); } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void earliestLatestTest3( ) { + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void earliestLatestTest3() { String query = "index=strawberry OR index=seagull earliest=-10y | stats count(_raw) by index"; - this.streamingTestUtil.performDPLTest(query, this.testFile,setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { - List indexAsList = res.select("index").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + this.streamingTestUtil + .performDPLTest(query, this.testFile, setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { + List indexAsList = res + .select("index") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); - assertTrue(indexAsList.contains("strawberry")); - assertTrue(indexAsList.contains("seagull")); - }); + assertTrue(indexAsList.contains("strawberry")); + assertTrue(indexAsList.contains("seagull")); + }); } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void earliestLatestTest4( ) { + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void earliestLatestTest4() { String query = "earliest=-10y index=strawberry OR 
index=seagull | stats count(_raw) by index"; - this.streamingTestUtil.performDPLTest(query, this.testFile,setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { - List indexAsList = res.select("index").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + this.streamingTestUtil + .performDPLTest(query, this.testFile, setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { + List indexAsList = res + .select("index") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); - assertTrue(indexAsList.contains("strawberry")); - assertTrue(indexAsList.contains("seagull")); - }); + assertTrue(indexAsList.contains("strawberry")); + assertTrue(indexAsList.contains("seagull")); + }); } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void earliestLatestTest5( ) { + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void earliestLatestTest5() { String query = "earliest=-10y index=strawberry OR (index=seagull latest=-10y) | stats count(_raw) by index"; - this.streamingTestUtil.performDPLTest(query, this.testFile,setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { - List indexAsList = res.select("index").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + this.streamingTestUtil + .performDPLTest(query, this.testFile, setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { + List indexAsList = res + .select("index") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); - assertTrue(indexAsList.contains("strawberry")); - assertFalse(indexAsList.contains("seagull")); - }); + assertTrue(indexAsList.contains("strawberry")); + assertFalse(indexAsList.contains("seagull")); + }); } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void earliestLatestTest6( ) { + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void earliestLatestTest6() { String query = "earliest=-10y index=strawberry OR index=seagull latest=-10y | stats count(_raw) by index"; - this.streamingTestUtil.performDPLTest(query, this.testFile,setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { - List indexAsList = res.select("index").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - assertFalse(indexAsList.contains("strawberry")); - assertFalse(indexAsList.contains("seagull")); - }); + this.streamingTestUtil + .performDPLTest(query, this.testFile, setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { + List indexAsList = res + .select("index") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + assertFalse(indexAsList.contains("strawberry")); + assertFalse(indexAsList.contains("seagull")); + }); } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void earliestLatestTest7( ) { + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void earliestLatestTest7() { String query = "earliest=-10y index=strawberry OR index=seagull latest=-1y | stats count(_raw) by index"; - this.streamingTestUtil.performDPLTest(query, this.testFile,setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { - List indexAsList = res.select("index").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + 
this.streamingTestUtil + .performDPLTest(query, this.testFile, setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { + List indexAsList = res + .select("index") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); - assertTrue(indexAsList.contains("strawberry")); - assertTrue(indexAsList.contains("seagull")); - }); + assertTrue(indexAsList.contains("strawberry")); + assertTrue(indexAsList.contains("seagull")); + }); } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void earliestLatestTest8( ) { + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void earliestLatestTest8() { String query = "earliest=-20y index=strawberry OR index=seagull earliest=-1d | stats count(_raw) by index"; - this.streamingTestUtil.performDPLTest(query, this.testFile,setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { - List indexAsList = res.select("index").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - assertFalse(indexAsList.contains("strawberry")); - assertFalse(indexAsList.contains("seagull")); - }); + this.streamingTestUtil + .performDPLTest(query, this.testFile, setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { + List indexAsList = res + .select("index") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + assertFalse(indexAsList.contains("strawberry")); + assertFalse(indexAsList.contains("seagull")); + }); } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void earliestLatestTest9( ) { + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void earliestLatestTest9() { String query = "earliest=-20y index=strawberry OR index=seagull earliest=now | stats count(_raw) by index"; - this.streamingTestUtil.performDPLTest(query, this.testFile,setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { - List indexAsList = res.select("index").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - assertFalse(indexAsList.contains("strawberry")); - assertFalse(indexAsList.contains("seagull")); - }); + this.streamingTestUtil + .performDPLTest(query, this.testFile, setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { + List indexAsList = res + .select("index") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + assertFalse(indexAsList.contains("strawberry")); + assertFalse(indexAsList.contains("seagull")); + }); } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void earliestLatestTest10( ) { + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void earliestLatestTest10() { String query = "earliest=-20y index=strawberry OR index=seagull latest=now | stats count(_raw) by index"; - this.streamingTestUtil.performDPLTest(query, this.testFile,setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { - List indexAsList = res.select("index").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + this.streamingTestUtil + .performDPLTest(query, this.testFile, setTimeDifferenceToSameAsDate("2023-01-01 12:00:00"), res -> { + List indexAsList = res + .select("index") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); - 
assertTrue(indexAsList.contains("strawberry")); - assertTrue(indexAsList.contains("seagull")); - }); + assertTrue(indexAsList.contains("strawberry")); + assertTrue(indexAsList.contains("seagull")); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void defaultFormatTest() { // MM/dd/yyyy:HH:mm:ss 2013-07-15 10:01:50 String query = "(index=strawberry OR index=seagull) AND earliest=03/15/2014:00:00:00"; this.streamingTestUtil.performDPLTest(query, this.testFile, res -> { - List time = res.select("_time").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List time = res + .select("_time") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals("2014-04-15 08:23:17", time.get(0)); assertEquals("2014-03-15 21:54:14", time.get(1)); @@ -252,19 +337,32 @@ public void defaultFormatTest() { // MM/dd/yyyy:HH:mm:ss 2013-07-15 10:01:50 } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void defaultFormatInvalidInputTest() { // MM/dd/yyyy:HH:mm:ss 2013-07-15 10:01:50 String query = "(index=strawberry OR index=seagull) AND earliest=31/31/2014:00:00:00"; - RuntimeException sqe = this.streamingTestUtil.performThrowingDPLTest(RuntimeException.class, query, this.testFile, res -> {}); + RuntimeException sqe = this.streamingTestUtil + .performThrowingDPLTest(RuntimeException.class, query, this.testFile, res -> { + }); assertEquals("TimeQualifier conversion error: <31/31/2014:00:00:00> can't be parsed.", sqe.getMessage()); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void ISO8601ZonedFormatTest() { // '2011-12-03T10:15:30+01:00' String query = "(index=strawberry OR index=seagull) AND earliest=2014-03-15T00:00:00+03:00"; this.streamingTestUtil.performDPLTest(query, this.testFile, res -> { - List time = res.select("_time").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List time = res + .select("_time") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals("2014-04-15 08:23:17", time.get(0)); assertEquals("2014-03-15 21:54:14", time.get(1)); @@ -272,11 +370,19 @@ public void ISO8601ZonedFormatTest() { // '2011-12-03T10:15:30+01:00' } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void ISO8601WithoutZoneFormatTest() { // '2011-12-03T10:15:30+01:00' String query = "(index=strawberry OR index=seagull) AND earliest=2014-03-15T00:00:00"; this.streamingTestUtil.performDPLTest(query, this.testFile, res -> { - List time = res.select("_time").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List time = res + .select("_time") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals("2014-04-15 08:23:17", time.get(0)); assertEquals("2014-03-15 21:54:14", time.get(1)); @@ -284,18 +390,29 @@ public void ISO8601WithoutZoneFormatTest() { // '2011-12-03T10:15:30+01:00' } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void OverflowTest( ) { + 
@DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void OverflowTest() { String query = "index=strawberry latest=-3644444444444444d"; this.streamingTestUtil.performDPLTest(query, this.testFile, res -> { - List time = res.select("_time").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List time = res + .select("_time") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertTrue(time.isEmpty()); }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void OverflowTest2( ) { + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void OverflowTest2() { String query = "index=strawberry earliest=-1000y@y latest=+3644444444444444d"; this.streamingTestUtil.performDPLTest(query, this.testFile, res -> { String maxTime = res.agg(functions.max("_time")).first().getString(0); @@ -307,8 +424,11 @@ public void OverflowTest2( ) { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void TimeformatTest1( ) { + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void TimeformatTest1() { String query = "index=strawberry timeformat=%s earliest=0"; this.streamingTestUtil.performDPLTest(query, this.epochTestFile, res -> { // epoch test data contains values from 1970-01-01 till 2050-03-15 @@ -321,8 +441,11 @@ public void TimeformatTest1( ) { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void TimeformatTest2( ) { + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void TimeformatTest2() { String query = "index=strawberry timeformat=\"%Y-%m-%d-%H-%M-%S\" earliest=2030-01-01-00-00-00 latest=2040-01-01-00-00-00"; this.streamingTestUtil.performDPLTest(query, this.epochTestFile, res -> { // epoch test data contains values from 1970-01-01 till 2050-03-15 @@ -368,7 +491,6 @@ public void RelativeTimestampMinutesTest() { // Expected result final long expected = i.plus(amount, ChronoUnit.MINUTES).getEpochSecond(); - RelativeTimeParser rtParser = new RelativeTimeParser(); units.forEach(unit -> { String relativeTimestamp = "+" + amount + unit; //+100min etc. @@ -389,7 +511,6 @@ public void RelativeTimestampHoursTest() { // Expected result final long expected = i.plus(amount, ChronoUnit.HOURS).getEpochSecond(); - RelativeTimeParser rtParser = new RelativeTimeParser(); units.forEach(unit -> { String relativeTimestamp = "+" + amount + unit; //+100hour etc. @@ -410,7 +531,6 @@ public void RelativeTimestampDaysTest() { // Expected result final long expected = i.plus(amount, ChronoUnit.DAYS).getEpochSecond(); - RelativeTimeParser rtParser = new RelativeTimeParser(); units.forEach(unit -> { String relativeTimestamp = "+" + amount + unit; //+100d etc. @@ -431,7 +551,6 @@ public void RelativeTimestampWeeksTest() { // Expected result final long expected = now.plusWeeks(amount).atZone(ZoneId.systemDefault()).toInstant().getEpochSecond(); - RelativeTimeParser rtParser = new RelativeTimeParser(); units.forEach(unit -> { String relativeTimestamp = "+" + amount + unit; //+100min etc. @@ -472,7 +591,6 @@ public void RelativeTimestampYearsTest() { // Expected result final long expected = now.plusYears(amount).atZone(ZoneId.systemDefault()).toInstant().getEpochSecond(); - RelativeTimeParser rtParser = new RelativeTimeParser(); units.forEach(unit -> { String relativeTimestamp = "+" + amount + unit; //+100min etc. 
@@ -525,7 +643,12 @@ public void RelativeTimestampOverflowPositiveTest() { // Amount to add final long v = Long.MAX_VALUE; - final long expected = Instant.ofEpochMilli(v).atZone(ZoneId.systemDefault()).withYear(9999).toInstant().getEpochSecond(); + final long expected = Instant + .ofEpochMilli(v) + .atZone(ZoneId.systemDefault()) + .withYear(9999) + .toInstant() + .getEpochSecond(); // positive overflow epoch should be long max value RelativeTimeParser rtParser = new RelativeTimeParser(); @@ -548,9 +671,12 @@ public void RelativeTimestampOverflowNegativeTest() { } @Test - @Disabled(value="Should be changed to a dataframe test") - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void defaultEarliestLatestTest1( ) { // '2011-12-03T10:15:30+01:00' + @Disabled(value = "Should be changed to a dataframe test") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void defaultEarliestLatestTest1() { // '2011-12-03T10:15:30+01:00' String query = "(index=strawberry OR index=seagull)"; this.streamingTestUtil.getCtx().setDplDefaultEarliest(-1L); this.streamingTestUtil.getCtx().setDplDefaultLatest(1L); @@ -567,8 +693,11 @@ public void defaultEarliestLatestTest1( ) { // '2011-12-03T10:15:30+01:00' } @Test - @Disabled(value="Should be changed to a dataframe test") - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Disabled(value = "Should be changed to a dataframe test") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void defaultEarliestLatestTest2() { // '2011-12-03T10:15:30+01:00' String query = "(index=strawberry earliest=2014-03-15T21:54:14+02:00) OR index=seagull)"; this.streamingTestUtil.getCtx().setDplDefaultEarliest(-1L); @@ -586,15 +715,23 @@ public void defaultEarliestLatestTest2() { // '2011-12-03T10:15:30+01:00' } @Test - @Disabled(value="Should be changed to a dataframe test") - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Disabled(value = "Should be changed to a dataframe test") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void defaultEarliestLatestTest3() { // '2011-12-03T10:15:30+01:00' String query = "(index=strawberry earliest=2014-03-15T21:54:14+02:00) OR (index=seagull earliest=2014-04-15T08:23:17+02:00))"; this.streamingTestUtil.getCtx().setDplDefaultEarliest(0L); this.streamingTestUtil.getCtx().setDplDefaultLatest(1678713103L); this.streamingTestUtil.performDPLTest(query, this.testFile, res -> { - List time = res.select("_time").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List time = res + .select("_time") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals(2, time.size()); assertEquals("2014-04-15 08:23:17", time.get(0)); @@ -609,9 +746,12 @@ public void defaultEarliestLatestTest3() { // '2011-12-03T10:15:30+01:00' } @Test - @Disabled(value="Should be changed to a dataframe test") - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void defaultEarliestLatestTest4( ) { + @Disabled(value = "Should be changed to a dataframe test") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void defaultEarliestLatestTest4() { String query = "(earliest=1900-01-01T00:00:00Z latest=1960-01-01T00:00:00Z) OR ((index=seagull earliest=1970-01-01T00:00:00Z latest=2100-01-01T00:00:00Z)) OR index=strawberry earliest=1950-01-01T00:00:00Z"; 
this.streamingTestUtil.getCtx().setDplDefaultEarliest(0L); this.streamingTestUtil.getCtx().setDplDefaultLatest(1678711652L); @@ -620,40 +760,30 @@ public void defaultEarliestLatestTest4( ) { }); - final String expectedXml = "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - "" + - ""; - final String expectedSpark = "(((((_time >= from_unixtime(-2208994789, yyyy-MM-dd HH:mm:ss)) AND (_time <= from_unixtime(-315626400, yyyy-MM-dd HH:mm:ss)))" + - " OR ((index RLIKE ^seagull$ AND (_time >= from_unixtime(-7200, yyyy-MM-dd HH:mm:ss))) AND (_time <= from_unixtime(4102437600, yyyy-MM-dd HH:mm:ss))))" + - " OR (index RLIKE ^strawberry$ AND (_time <= from_unixtime(1678711652, yyyy-MM-dd HH:mm:ss)))) AND (_time >= from_unixtime(-631159200, yyyy-MM-dd HH:mm:ss)))"; + final String expectedXml = "" + "" + "" + "" + + "" + + "" + "" + "" + "" + + "" + + "" + "" + + "" + "" + "" + "" + + "" + + "" + "" + "" + + "" + ""; + final String expectedSpark = "(((((_time >= from_unixtime(-2208994789, yyyy-MM-dd HH:mm:ss)) AND (_time <= from_unixtime(-315626400, yyyy-MM-dd HH:mm:ss)))" + + " OR ((index RLIKE ^seagull$ AND (_time >= from_unixtime(-7200, yyyy-MM-dd HH:mm:ss))) AND (_time <= from_unixtime(4102437600, yyyy-MM-dd HH:mm:ss))))" + + " OR (index RLIKE ^strawberry$ AND (_time <= from_unixtime(1678711652, yyyy-MM-dd HH:mm:ss)))) AND (_time >= from_unixtime(-631159200, yyyy-MM-dd HH:mm:ss)))"; assertEquals(expectedSpark, this.streamingTestUtil.getCtx().getSparkQuery()); assertEquals(expectedXml, this.streamingTestUtil.getCtx().getArchiveQuery()); } @Test - @Disabled(value="Should be changed to a dataframe test") - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void defaultEarliestLatestTest5( ) { + @Disabled(value = "Should be changed to a dataframe test") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void defaultEarliestLatestTest5() { String query = "index=strawberry OR index=seagull earliest=2020-01-01T00:00:00Z"; this.streamingTestUtil.getCtx().setDplDefaultEarliest(0L); this.streamingTestUtil.getCtx().setDplDefaultLatest(1L); @@ -670,11 +800,14 @@ public void defaultEarliestLatestTest5( ) { } @Test - @Disabled(value="Should be changed to a dataframe test") - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void defaultEarliestLatestTest6( ) { - String query = "index=seagull earliest=1970-01-01T00:00:00.000+02:00 OR " + - "index=strawberry earliest=2010-12-31T00:00:00.000+02:00"; + @Disabled(value = "Should be changed to a dataframe test") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void defaultEarliestLatestTest6() { + String query = "index=seagull earliest=1970-01-01T00:00:00.000+02:00 OR " + + "index=strawberry earliest=2010-12-31T00:00:00.000+02:00"; this.streamingTestUtil.getCtx().setDplDefaultEarliest(0L); this.streamingTestUtil.getCtx().setDplDefaultLatest(1L); @@ -689,11 +822,13 @@ public void defaultEarliestLatestTest6( ) { assertEquals(expectedXml, this.streamingTestUtil.getCtx().getArchiveQuery()); } - @Test - @Disabled(value="Should be changed to a dataframe test") - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void defaultEarliestLatestTest7( ) { + @Disabled(value = "Should be changed to a dataframe test") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void defaultEarliestLatestTest7() { String query 
= "(index=strawberry earliest=2019-01-01T00:00:00Z) AND (index=seagull) earliest=2009-01-01T00:00:00Z"; this.streamingTestUtil.getCtx().setDplDefaultEarliest(0L); this.streamingTestUtil.getCtx().setDplDefaultLatest(1L); @@ -708,9 +843,12 @@ public void defaultEarliestLatestTest7( ) { } @Test - @Disabled(value="Should be changed to a dataframe test") - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void defaultEarliestLatestTest8( ) { + @Disabled(value = "Should be changed to a dataframe test") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void defaultEarliestLatestTest8() { String query = "(index=strawberry earliest=2019-01-01T00:00:00Z) AND (index=seagull) earliest=2009-01-01T00:00:00Z"; this.streamingTestUtil.getCtx().setDplDefaultEarliest(0L); this.streamingTestUtil.getCtx().setDplDefaultLatest(1L); @@ -725,8 +863,11 @@ public void defaultEarliestLatestTest8( ) { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchIssue383Test( ) { + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchIssue383Test() { // test issue 383 // Case: can't match XYZ="yes asd" _raw column, except by omitting double quotes entirely String query = " index=abc earliest=\"01/01/2022:00:00:00\" latest=\"01/02/2022:00:00:00\" \"XYZ=\\\"yes asd\\\"\" "; @@ -737,8 +878,11 @@ public void searchIssue383Test( ) { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchIssue383_2Test( ) { + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchIssue383_2Test() { // test issue 383 // Case: can't match XYZ="yes asd" _raw column, except by omitting double quotes entirely String query = " index=abc \"XYZ=\\\"yes asd\\\"\" "; @@ -751,9 +895,12 @@ public void searchIssue383_2Test( ) { } @Test - @Disabled(value="Broken on pth-03") - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchIssue384Test( ) { + @Disabled(value = "Broken on pth-03") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchIssue384Test() { // test issue 384 // FIXME: Parser error: line 1:35 token recognition error at: '"[17/Aug/2023:08:03:55.441546368 +0300] conn=' // |makeresultscount=1|eval_raw=917818 @@ -762,14 +909,20 @@ public void searchIssue384Test( ) { Row r = res.select("_raw").first(); String s = r.getAs(0).toString(); - assertEquals("[17/Aug/2023:08:03:55.441546368 +0300] conn=917818 op=5 EXT oid=\"2.16.840.1.113730.3.5.12\" name=\"replication-multimaster-extop\"", s); + assertEquals( + "[17/Aug/2023:08:03:55.441546368 +0300] conn=917818 op=5 EXT oid=\"2.16.840.1.113730.3.5.12\" name=\"replication-multimaster-extop\"", + s + ); }); } - @Disabled(value="Broken on pth-03") /* FIXME: Parser can't handle = symbol inside quotes */ + @Disabled(value = "Broken on pth-03") /* FIXME: Parser can't handle = symbol inside quotes */ @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchIssue384_2Test( ) { + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchIssue384_2Test() { // test issue 384 // works with escaping '=' symbols String query = " | makeresults count=1 | eval _raw=\"[10/Jan/2020:05:03:55.441546368 +0300] xyz\\=654321 ab\\=2 DEF pid\\=\\\"1.23.456.7.899999.1.2.34\\\" key\\=\\\"random-words-here\\\"\""; @@ -777,25 +930,34 @@ public void 
searchIssue384_2Test( ) { Row r = res.select("_raw").first(); String s = r.getAs(0).toString(); - assertEquals("[17/Aug/2023:08:03:55.441546368 +0300] conn=917818 op=5 EXT oid=\"2.16.840.1.113730.3.5.12\" name=\"replication-multimaster-extop\"", s); + assertEquals( + "[17/Aug/2023:08:03:55.441546368 +0300] conn=917818 op=5 EXT oid=\"2.16.840.1.113730.3.5.12\" name=\"replication-multimaster-extop\"", + s + ); }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchIssue382Test( ) { + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchIssue382Test() { // test issue 382 // case: index=* earliest=x latest=y abcdef and index=*abc* earliest=x latest=y abcdef match differently (data/no data) - this.streamingTestUtil.performDPLTest("index=*g*", this.testFile,res -> { - if (res.count() == 0){ + this.streamingTestUtil.performDPLTest("index=*g*", this.testFile, res -> { + if (res.count() == 0) { fail("(index=*g*) Expected result rows, instead got 0"); } }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchIssue382Test2( ) { + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchIssue382Test2() { // test issue 382 // index=* case this.streamingTestUtil.performDPLTest("index=*", this.testFile, res -> { diff --git a/src/test/java/com/teragrep/pth10/EventstatsTransformationTest.java b/src/test/java/com/teragrep/pth10/EventstatsTransformationTest.java index a7066df..701adc6 100644 --- a/src/test/java/com/teragrep/pth10/EventstatsTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/EventstatsTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
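The multi-line @DisabledIfSystemProperty form introduced throughout this patch is plain JUnit 5 conditional execution; a minimal, self-contained sketch of the guard (class and method names here are illustrative, not taken from the patch) looks like this:

import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.DisabledIfSystemProperty;

import static org.junit.jupiter.api.Assertions.assertTrue;

class ConditionalSparkTestSketch {

    @Test
    @DisabledIfSystemProperty(
            named = "skipSparkTest",
            matches = "true"
    )
    void runsOnlyWhenSparkTestsAreEnabled() {
        // Disabled entirely when the build sets -DskipSparkTest=true,
        // e.g. `mvn test -DskipSparkTest=true`; otherwise runs normally.
        assertTrue(true);
    }
}

Running the build with -DskipSparkTest=true therefore skips every test carrying this guard.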
* * * Additional permission under GNU Affero General Public License version 3 @@ -61,28 +61,26 @@ import static org.junit.jupiter.api.Assertions.assertEquals; /** - * Tests for the EventstatsTransformation implementation - * Uses streaming datasets + * Tests for the EventstatsTransformation implementation Uses streaming datasets + * * @author eemhu - * */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class EventstatsTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(EventstatsTransformationTest.class); private final String testFile = "src/test/resources/eventstatsTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -102,74 +100,116 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // Standard eventstats, without wildcards in WITH-clause + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // Standard eventstats, without wildcards in WITH-clause public void eventstats_test_NoByClause() { - streamingTestUtil.performDPLTest( - "index=index_A | eventstats avg(offset) AS avg_offset", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, avg_offset]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - List listOfOffset = ds.select("avg_offset").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(1, listOfOffset.size()); - assertEquals("5.5", listOfOffset.get(0)); - } - ); + 
streamingTestUtil.performDPLTest("index=index_A | eventstats avg(offset) AS avg_offset", testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, avg_offset]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + List listOfOffset = ds + .select("avg_offset") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(1, listOfOffset.size()); + assertEquals("5.5", listOfOffset.get(0)); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // Standard eventstats, without wildcards in WITH-clause + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // Standard eventstats, without wildcards in WITH-clause public void eventstats_test_WithByClause() { - streamingTestUtil.performDPLTest( - "index=index_A | eventstats avg(offset) AS avg_offset BY sourcetype", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, avg_offset]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - List listOfOffset = ds.select("avg_offset").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + streamingTestUtil + .performDPLTest("index=index_A | eventstats avg(offset) AS avg_offset BY sourcetype", testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, avg_offset]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" + ); + List listOfOffset = ds + .select("avg_offset") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals(2, listOfOffset.size()); assertEquals("6.0", listOfOffset.get(0)); assertEquals("5.0", listOfOffset.get(1)); - } - ); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // count with implied wildcard + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // count with implied wildcard public void eventstats_test_count() { - streamingTestUtil.performDPLTest( - "index=index_A | eventstats count", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, count]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - List listOfCount = ds.select("count").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(1, listOfCount.size()); - } - ); + streamingTestUtil.performDPLTest("index=index_A | eventstats count", testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, count]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" 
+ ); + List listOfCount = ds + .select("count") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(1, listOfCount.size()); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // multiple aggregation functions + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // multiple aggregation functions public void eventstats_test_multi() { - streamingTestUtil.performDPLTest( - "index=index_A | eventstats count avg(offset) stdevp(offset)", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, count, avg(offset), stdevp(offset)]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - List listOfCount = ds.select("count").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + streamingTestUtil + .performDPLTest("index=index_A | eventstats count avg(offset) stdevp(offset)", testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, count, avg(offset), stdevp(offset)]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + List listOfCount = ds + .select("count") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals(1, listOfCount.size()); - List listOfAvg = ds.select("avg(offset)").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List listOfAvg = ds + .select("avg(offset)") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals(1, listOfAvg.size()); - List listOfStdevp = ds.select("stdevp(offset)").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List listOfStdevp = ds + .select("stdevp(offset)") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals(1, listOfStdevp.size()); - } - ); + }); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/FillnullTransformationTest.java b/src/test/java/com/teragrep/pth10/FillnullTransformationTest.java index 5c8ceb7..d8de60f 100644 --- a/src/test/java/com/teragrep/pth10/FillnullTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/FillnullTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
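The select/dropDuplicates/collectAsList/stream chain that Spotless reflows in these eventstats tests recurs in several files; a minimal sketch of the same pattern as a standalone helper (method and class names are illustrative, the Dataset/Row calls are the ones the tests already use):

import java.util.List;
import java.util.stream.Collectors;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

final class ColumnValuesSketch {

    // Distinct values of `column` in `ds`, rendered as strings.
    static List<String> distinctValuesAsStrings(Dataset<Row> ds, String column) {
        return ds
                .select(column)
                .dropDuplicates()
                .collectAsList()
                .stream()
                .map(r -> r.getAs(0).toString())
                .collect(Collectors.toList());
    }
}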
* * * Additional permission under GNU Affero General Public License version 3 @@ -60,22 +60,21 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class FillnullTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(FillnullTransformationTest.class); // data has 3 empty strings ("") and 1 literal null in _raw column private final String testFile = "src/test/resources/fillnull/fillnull0*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -101,72 +100,70 @@ void tearDown() { // base query, no optional params @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void fillnullBasicQueryTest() { - streamingTestUtil.performDPLTest( - "index=* | fillnull", - testFile, - ds -> { - long zeroesCount = ds.where(functions.col("_raw").equalTo(functions.lit("0"))).count(); - assertEquals(4, zeroesCount); - }); + streamingTestUtil.performDPLTest("index=* | fillnull", testFile, ds -> { + long zeroesCount = ds.where(functions.col("_raw").equalTo(functions.lit("0"))).count(); + assertEquals(4, zeroesCount); + }); } // explicit field param @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void fillnullExplicitFieldTest() { - streamingTestUtil.performDPLTest( - "index=* | fillnull _raw", - testFile, - ds -> { - long zeroesCount = ds.where(functions.col("_raw").equalTo(functions.lit("0"))).count(); - assertEquals(4, zeroesCount); - }); + streamingTestUtil.performDPLTest("index=* | fillnull _raw", testFile, ds -> { + long zeroesCount = ds.where(functions.col("_raw").equalTo(functions.lit("0"))).count(); + assertEquals(4, zeroesCount); + }); } // multiple explicit field params @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + 
@DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void fillnullMultipleExplicitFieldsTest() { - streamingTestUtil.performDPLTest( - "index=* | fillnull _raw, source", - testFile, - ds -> { - long zeroesCount = ds.where(functions.col("_raw").equalTo(functions.lit("0"))).count(); - long zeroesCount2 = ds.where(functions.col("source").equalTo(functions.lit("0"))).count(); - assertEquals(4, zeroesCount); - assertEquals(2, zeroesCount2); - }); + streamingTestUtil.performDPLTest("index=* | fillnull _raw, source", testFile, ds -> { + long zeroesCount = ds.where(functions.col("_raw").equalTo(functions.lit("0"))).count(); + long zeroesCount2 = ds.where(functions.col("source").equalTo(functions.lit("0"))).count(); + assertEquals(4, zeroesCount); + assertEquals(2, zeroesCount2); + }); } // non-existent fields as param @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void fillnullNonexistentFieldParamTest() { - streamingTestUtil.performDPLTest( - "index=* | fillnull fakeField", - testFile, - ds -> { - // for a field that does not exist, create it and fill with filler value - // => all values will be zero - long zeroesCount = ds.where(functions.col("fakeField").equalTo(functions.lit("0"))).count(); - assertEquals(ds.count(), zeroesCount); - }); + streamingTestUtil.performDPLTest("index=* | fillnull fakeField", testFile, ds -> { + // for a field that does not exist, create it and fill with filler value + // => all values will be zero + long zeroesCount = ds.where(functions.col("fakeField").equalTo(functions.lit("0"))).count(); + assertEquals(ds.count(), zeroesCount); + }); } // field param and custom filler value @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void fillnullCustomFillerStringTest() { - streamingTestUtil.performDPLTest( - "index=* | fillnull value=\"\" _raw", - testFile, - ds -> { - long zeroesCount = ds.where(functions.col("_raw").equalTo(functions.lit(""))).count(); - assertEquals(4, zeroesCount); - }); + streamingTestUtil.performDPLTest("index=* | fillnull value=\"\" _raw", testFile, ds -> { + long zeroesCount = ds.where(functions.col("_raw").equalTo(functions.lit(""))).count(); + assertEquals(4, zeroesCount); + }); } } - - diff --git a/src/test/java/com/teragrep/pth10/FormatTransformationTest.java b/src/test/java/com/teragrep/pth10/FormatTransformationTest.java index 19544e2..4b0f912 100644 --- a/src/test/java/com/teragrep/pth10/FormatTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/FormatTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
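The fillnull assertions above all reduce to counting rows whose column equals the filler value; a small sketch of that check, assuming the same Spark functions API (col, lit) the tests use and with hypothetical names:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.functions;

final class FillerCountSketch {

    // Number of rows in `ds` whose `column` equals the given filler value.
    static long countFilled(Dataset<Row> ds, String column, String filler) {
        return ds.where(functions.col(column).equalTo(functions.lit(filler))).count();
    }
}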
* * * Additional permission under GNU Affero General Public License version 3 @@ -62,21 +62,20 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class FormatTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(FormatTransformationTest.class); private final String testFile = "src/test/resources/strcatTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -96,250 +95,129 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void formatTransformationTest0() { String q = "index=index_A | format "; - streamingTestUtil.performDPLTest(q, testFile, res -> { - // Check if result contains the column that was created for format result - assertTrue(Arrays.toString(res.columns()).contains("search")); - - // List of expected values for the format destination field - List expectedValues = Collections.singletonList( - "( " + - "( " + - "_time=\"2023-09-06 11:22:31.0\" " + - "AND " + - "id=\"1\" " + - "AND " + - "_raw=\"raw 01\" " + - "AND " + - "index=\"index_A\" " + - "AND " + - "sourcetype=\"A:X:0\" " + - "AND " + - "host=\"host\" " + - "AND " + - "source=\"input\" " + - "AND " + - "partition=\"0\" " + - "AND offset=\"1\"" + - " ) " + - "OR " + - "( " + - "_time=\"2023-09-06 12:22:31.0\" " + - "AND " + - "id=\"2\" " + - "AND " + - "_raw=\"raw 02\" " + - "AND " + - "index=\"index_A\" " + - "AND " + - "sourcetype=\"A:X:0\" " + - "AND " + - "host=\"host\" " + - "AND " + - "source=\"input\" " + - "AND " + - "partition=\"0\" " + - "AND " + - "offset=\"2\" " + - ") " + - "OR " + - "( " + - "_time=\"2023-09-06 13:22:31.0\" " 
+ - "AND id=\"3\" " + - "AND " + - "_raw=\"raw 03\" " + - "AND " + - "index=\"index_A\" " + - "AND " + - "sourcetype=\"A:Y:0\" " + - "AND " + - "host=\"host\" " + - "AND " + - "source=\"input\" " + - "AND " + - "partition=\"0\" " + - "AND " + - "offset=\"3\" " + - ") " + - "OR " + - "( " + - "_time=\"2023-09-06 14:22:31.0\" " + - "AND " + - "id=\"4\" " + - "AND " + - "_raw=\"raw 04\" " + - "AND " + - "index=\"index_A\" " + - "AND " + - "sourcetype=\"A:Y:0\" " + - "AND " + - "host=\"host\" " + - "AND " + - "source=\"input\" " + - "AND " + - "partition=\"0\" " + - "AND " + - "offset=\"4\" " + - ") " + - "OR " + - "( " + - "_time=\"2023-09-06 15:22:31.0\" " + - "AND " + - "id=\"5\" " + - "AND " + - "_raw=\"raw 05\" " + - "AND " + - "index=\"index_A\" " + - "AND " + - "sourcetype=\"A:Y:0\" " + - "AND " + - "host=\"host\" " + - "AND " + - "source=\"input\" " + - "AND " + - "partition=\"0\" " + - "AND " + - "offset=\"5\" " + - ") " + - ")" - ); - - // Destination field from result dataset - List searchAsList = res - .select("search") - .collectAsList() - .stream() - .map(r -> r.getString(0)) - .collect(Collectors.toList()); - - // Assert search field contents as equals with expected contents - assertEquals(expectedValues, searchAsList); - }); + streamingTestUtil + .performDPLTest( + q, testFile, res -> { + // Check if result contains the column that was created for format result + assertTrue(Arrays.toString(res.columns()).contains("search")); + + // List of expected values for the format destination field + List expectedValues = Collections + .singletonList( + "( " + "( " + "_time=\"2023-09-06 11:22:31.0\" " + "AND " + "id=\"1\" " + + "AND " + "_raw=\"raw 01\" " + "AND " + "index=\"index_A\" " + + "AND " + "sourcetype=\"A:X:0\" " + "AND " + "host=\"host\" " + + "AND " + "source=\"input\" " + "AND " + "partition=\"0\" " + + "AND offset=\"1\"" + " ) " + "OR " + "( " + + "_time=\"2023-09-06 12:22:31.0\" " + "AND " + "id=\"2\" " + "AND " + + "_raw=\"raw 02\" " + "AND " + "index=\"index_A\" " + "AND " + + "sourcetype=\"A:X:0\" " + "AND " + "host=\"host\" " + "AND " + + "source=\"input\" " + "AND " + "partition=\"0\" " + "AND " + + "offset=\"2\" " + ") " + "OR " + "( " + + "_time=\"2023-09-06 13:22:31.0\" " + "AND id=\"3\" " + "AND " + + "_raw=\"raw 03\" " + "AND " + "index=\"index_A\" " + "AND " + + "sourcetype=\"A:Y:0\" " + "AND " + "host=\"host\" " + "AND " + + "source=\"input\" " + "AND " + "partition=\"0\" " + "AND " + + "offset=\"3\" " + ") " + "OR " + "( " + + "_time=\"2023-09-06 14:22:31.0\" " + "AND " + "id=\"4\" " + "AND " + + "_raw=\"raw 04\" " + "AND " + "index=\"index_A\" " + "AND " + + "sourcetype=\"A:Y:0\" " + "AND " + "host=\"host\" " + "AND " + + "source=\"input\" " + "AND " + "partition=\"0\" " + "AND " + + "offset=\"4\" " + ") " + "OR " + "( " + + "_time=\"2023-09-06 15:22:31.0\" " + "AND " + "id=\"5\" " + "AND " + + "_raw=\"raw 05\" " + "AND " + "index=\"index_A\" " + "AND " + + "sourcetype=\"A:Y:0\" " + "AND " + "host=\"host\" " + "AND " + + "source=\"input\" " + "AND " + "partition=\"0\" " + "AND " + + "offset=\"5\" " + ") " + ")" + ); + + // Destination field from result dataset + List searchAsList = res.select("search").collectAsList().stream().map(r -> r.getString(0)).collect(Collectors.toList()); + + // Assert search field contents as equals with expected contents + assertEquals(expectedValues, searchAsList); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void 
formatTransformationTest1() { String q = "index=index_A | eval a=mvappend(\"1\", \"2\") | format maxresults=1 "; - streamingTestUtil.performDPLTest(q, testFile, res -> { - // Check if result contains the column that was created for format result - assertTrue(Arrays.toString(res.columns()).contains("search")); - - // List of expected values for the format destination field - List expectedValues = Collections.singletonList( - "( " + - "( " + - "_time=\"2023-09-06 11:22:31.0\" " + - "AND " + - "id=\"1\" " + - "AND " + - "_raw=\"raw 01\" " + - "AND " + - "index=\"index_A\" " + - "AND " + - "sourcetype=\"A:X:0\" " + - "AND " + - "host=\"host\" " + - "AND " + - "source=\"input\" " + - "AND " + - "partition=\"0\" " + - "AND " + - "offset=\"1\" " + - "AND " + - "( " + - "a=\"1\" " + - "OR " + - "a=\"2\" " + - ") " + - ") " + - ")" - ); - - // Destination field from result dataset - List searchAsList = res - .select("search") - .collectAsList() - .stream() - .map(r -> r.getString(0)) - .collect(Collectors.toList()); - - // Assert search field contents as equals with expected contents - assertEquals(expectedValues, searchAsList); - }); + streamingTestUtil + .performDPLTest( + q, testFile, res -> { + // Check if result contains the column that was created for format result + assertTrue(Arrays.toString(res.columns()).contains("search")); + + // List of expected values for the format destination field + List expectedValues = Collections + .singletonList( + "( " + "( " + "_time=\"2023-09-06 11:22:31.0\" " + "AND " + "id=\"1\" " + + "AND " + "_raw=\"raw 01\" " + "AND " + "index=\"index_A\" " + + "AND " + "sourcetype=\"A:X:0\" " + "AND " + "host=\"host\" " + + "AND " + "source=\"input\" " + "AND " + "partition=\"0\" " + + "AND " + "offset=\"1\" " + "AND " + "( " + "a=\"1\" " + "OR " + + "a=\"2\" " + ") " + ") " + ")" + ); + + // Destination field from result dataset + List searchAsList = res.select("search").collectAsList().stream().map(r -> r.getString(0)).collect(Collectors.toList()); + + // Assert search field contents as equals with expected contents + assertEquals(expectedValues, searchAsList); + } + ); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void formatTransformationTest2() { String q = "index=index_A | format maxresults=2 \"ROWPRE\" \"COLPRE\" \"COLSEP\" \"COLSUF\"\"ROWSEP\" \"ROWSUF\" "; - streamingTestUtil.performDPLTest(q, testFile, res -> { - // Check if result contains the column that was created for format result - assertTrue(Arrays.toString(res.columns()).contains("search")); - - // List of expected values for the format destination field - List expectedValues = Collections.singletonList( - "ROWPRE " + - "COLPRE " + - "_time=\"2023-09-06 11:22:31.0\" " + - "COLSEP " + - "id=\"1\" " + - "COLSEP " + - "_raw=\"raw 01\" " + - "COLSEP " + - "index=\"index_A\" " + - "COLSEP " + - "sourcetype=\"A:X:0\" " + - "COLSEP " + - "host=\"host\" " + - "COLSEP " + - "source=\"input\" " + - "COLSEP " + - "partition=\"0\" " + - "COLSEP " + - "offset=\"1\"" + - " COLSUF " + - "ROWSEP " + - "COLPRE " + - "_time=\"2023-09-06 12:22:31.0\" " + - "COLSEP " + - "id=\"2\" " + - "COLSEP " + - "_raw=\"raw 02\" " + - "COLSEP " + - "index=\"index_A\" " + - "COLSEP " + - "sourcetype=\"A:X:0\" " + - "COLSEP " + - "host=\"host\" " + - "COLSEP " + - "source=\"input\" " + - "COLSEP " + - "partition=\"0\" " + - "COLSEP " + - "offset=\"2\" " + - "COLSUF " + - "ROWSUF" - ); - - // Destination field from result 
dataset - List searchAsList = res - .select("search") - .collectAsList() - .stream() - .map(r -> r.getString(0)) - .collect(Collectors.toList()); - - // Assert search field contents as equals with expected contents - assertEquals(expectedValues, searchAsList); - }); + streamingTestUtil + .performDPLTest( + q, testFile, res -> { + // Check if result contains the column that was created for format result + assertTrue(Arrays.toString(res.columns()).contains("search")); + + // List of expected values for the format destination field + List expectedValues = Collections + .singletonList( + "ROWPRE " + "COLPRE " + "_time=\"2023-09-06 11:22:31.0\" " + "COLSEP " + + "id=\"1\" " + "COLSEP " + "_raw=\"raw 01\" " + "COLSEP " + + "index=\"index_A\" " + "COLSEP " + "sourcetype=\"A:X:0\" " + + "COLSEP " + "host=\"host\" " + "COLSEP " + "source=\"input\" " + + "COLSEP " + "partition=\"0\" " + "COLSEP " + "offset=\"1\"" + + " COLSUF " + "ROWSEP " + "COLPRE " + + "_time=\"2023-09-06 12:22:31.0\" " + "COLSEP " + "id=\"2\" " + + "COLSEP " + "_raw=\"raw 02\" " + "COLSEP " + "index=\"index_A\" " + + "COLSEP " + "sourcetype=\"A:X:0\" " + "COLSEP " + "host=\"host\" " + + "COLSEP " + "source=\"input\" " + "COLSEP " + "partition=\"0\" " + + "COLSEP " + "offset=\"2\" " + "COLSUF " + "ROWSUF" + ); + + // Destination field from result dataset + List searchAsList = res.select("search").collectAsList().stream().map(r -> r.getString(0)).collect(Collectors.toList()); + + // Assert search field contents as equals with expected contents + assertEquals(expectedValues, searchAsList); + } + ); } } diff --git a/src/test/java/com/teragrep/pth10/IplocationTransformationTest.java b/src/test/java/com/teragrep/pth10/IplocationTransformationTest.java index ecbc934..3ad5abc 100644 --- a/src/test/java/com/teragrep/pth10/IplocationTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/IplocationTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
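The test schemas reformatted in these files all follow the same StructType/StructField construction; a trimmed sketch of the pattern (the field subset below is chosen for illustration only):

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.MetadataBuilder;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

final class TestSchemaSketch {

    // Builds a reduced version of the streaming test schema used by these tests.
    static StructType minimalSchema() {
        return new StructType(new StructField[] {
                new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()),
                new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()),
                new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()),
                new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build())
        });
    }
}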
* * * Additional permission under GNU Affero General Public License version 3 @@ -48,7 +48,6 @@ import com.teragrep.pth10.ast.commands.transformstatement.iplocation.IplocationGeoIPDataMapper; import com.teragrep.pth10.ast.commands.transformstatement.iplocation.IplocationRirDataMapper; import org.apache.hadoop.conf.Configuration; -import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.streaming.StreamingQueryException; import org.apache.spark.sql.types.DataTypes; @@ -69,28 +68,35 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class IplocationTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(IplocationTransformationTest.class); private final String testFile = "src/test/resources/IplocationTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("otherIP", DataTypes.StringType, true, new MetadataBuilder().build()) - } - ); - - private final String[] GEOIP_MINIMAL_COLUMNS = new String[]{"country", "region", "city", "lat", "lon"}; - private final String[] GEOIP_FULL_COLUMNS = new String[]{"country", "region", "city", "metroCode", "continent", "lat", "lon"}; - private final String[] RIR_COLUMNS = new String[]{"operator", "country"}; - private final String[] COUNTRY_COLUMNS = new String[]{"country", "continent"}; + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("otherIP", DataTypes.StringType, true, new MetadataBuilder().build()) + }); + + private final String[] GEOIP_MINIMAL_COLUMNS = new String[] { + "country", "region", "city", "lat", "lon" + }; + private final String[] GEOIP_FULL_COLUMNS = new String[] { + "country", "region", "city", "metroCode", "continent", "lat", "lon" + }; + private final String[] RIR_COLUMNS = new String[] { + "operator", "country" + }; + private final String[] 
COUNTRY_COLUMNS = new String[] { + "country", "continent" + }; private StreamingTestUtil streamingTestUtil; @@ -110,14 +116,19 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - @DisabledIfSystemProperty(named="skipGeoLiteTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + @DisabledIfSystemProperty( + named = "skipGeoLiteTest", + matches = "true" + ) public void iplocationTest_GeoLite2City_1() { String mmdbPath = "/usr/share/GeoIP/GeoLite2-City.mmdb"; String[] expectedCols = GEOIP_MINIMAL_COLUMNS; @@ -128,13 +139,20 @@ public void iplocationTest_GeoLite2City_1() { LOGGER.info("Consumer dataset's schema is <{}>", ds.schema()); // GEO DB type, get db mapper - IplocationGeoIPDataMapper mapper = new IplocationGeoIPDataMapper(mmdbPath, this.streamingTestUtil.getCtx().nullValue, - extractMapFromHadoopCfg(this.streamingTestUtil.getCtx().getSparkSession().sparkContext().hadoopConfiguration())); + IplocationGeoIPDataMapper mapper = new IplocationGeoIPDataMapper( + mmdbPath, + this.streamingTestUtil.getCtx().nullValue, + extractMapFromHadoopCfg( + this.streamingTestUtil.getCtx().getSparkSession().sparkContext().hadoopConfiguration() + ) + ); // run mapper on ip to assert expected List ips = ds.select(ipColumn, expectedCols).collectAsList(); for (Row ip : ips) { - Map result = assertDoesNotThrow(() -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true)); + Map result = assertDoesNotThrow( + () -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true) + ); for (String col : expectedCols) { assertEquals(result.get(col), ip.getAs(ip.fieldIndex(col))); @@ -144,7 +162,10 @@ public void iplocationTest_GeoLite2City_1() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void iplocationTest_RirDataSample_2() { String mmdbPath = "src/test/resources/rir-data.sample.mmdb"; String[] expectedCols = RIR_COLUMNS; @@ -155,13 +176,20 @@ public void iplocationTest_RirDataSample_2() { LOGGER.info("Consumer dataset's schema is <{}>", ds.schema()); // RIR DB type - IplocationRirDataMapper mapper = new IplocationRirDataMapper(mmdbPath, this.streamingTestUtil.getCtx().nullValue, - extractMapFromHadoopCfg(this.streamingTestUtil.getCtx().getSparkSession().sparkContext().hadoopConfiguration())); + IplocationRirDataMapper mapper = new IplocationRirDataMapper( + mmdbPath, + this.streamingTestUtil.getCtx().nullValue, + extractMapFromHadoopCfg( + this.streamingTestUtil.getCtx().getSparkSession().sparkContext().hadoopConfiguration() + ) + ); // run mapper on ip to assert expected List ips = ds.select(ipColumn, expectedCols).collectAsList(); for (Row ip : ips) { - Map result = assertDoesNotThrow(() -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true)); + Map result = assertDoesNotThrow( + () -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true) + ); for (String col : expectedCols) { String expected = result.get(col); @@ -172,8 +200,14 @@ public void iplocationTest_RirDataSample_2() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - @DisabledIfSystemProperty(named="skipGeoLiteTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + @DisabledIfSystemProperty( + named = "skipGeoLiteTest", + 
matches = "true" + ) public void iplocationTest_GeoLite2Country_3() { String mmdbPath = "/usr/share/GeoIP/GeoLite2-Country.mmdb"; String[] expectedCols = COUNTRY_COLUMNS; @@ -184,13 +218,20 @@ public void iplocationTest_GeoLite2Country_3() { LOGGER.info("Consumer dataset's schema is <{}>", ds.schema()); // GEO DB type, get db mapper - IplocationGeoIPDataMapper mapper = new IplocationGeoIPDataMapper(mmdbPath, this.streamingTestUtil.getCtx().nullValue, - extractMapFromHadoopCfg(this.streamingTestUtil.getCtx().getSparkSession().sparkContext().hadoopConfiguration())); + IplocationGeoIPDataMapper mapper = new IplocationGeoIPDataMapper( + mmdbPath, + this.streamingTestUtil.getCtx().nullValue, + extractMapFromHadoopCfg( + this.streamingTestUtil.getCtx().getSparkSession().sparkContext().hadoopConfiguration() + ) + ); // run mapper on ip to assert expected List ips = ds.select(ipColumn, expectedCols).collectAsList(); for (Row ip : ips) { - Map result = assertDoesNotThrow(() -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true)); + Map result = assertDoesNotThrow( + () -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true) + ); for (String col : expectedCols) { assertEquals(result.get(col), ip.getAs(ip.fieldIndex(col))); @@ -200,8 +241,14 @@ public void iplocationTest_GeoLite2Country_3() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - @DisabledIfSystemProperty(named="skipGeoLiteTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + @DisabledIfSystemProperty( + named = "skipGeoLiteTest", + matches = "true" + ) public void iplocationTest_GeoLite2City_4() { String mmdbPath = "/usr/share/GeoIP/GeoLite2-City.mmdb"; String[] expectedCols = GEOIP_FULL_COLUMNS; @@ -212,13 +259,20 @@ public void iplocationTest_GeoLite2City_4() { LOGGER.info("Consumer dataset's schema is <{}>", ds.schema()); // GEO DB type, get db mapper - IplocationGeoIPDataMapper mapper = new IplocationGeoIPDataMapper(mmdbPath, this.streamingTestUtil.getCtx().nullValue, - extractMapFromHadoopCfg(this.streamingTestUtil.getCtx().getSparkSession().sparkContext().hadoopConfiguration())); + IplocationGeoIPDataMapper mapper = new IplocationGeoIPDataMapper( + mmdbPath, + this.streamingTestUtil.getCtx().nullValue, + extractMapFromHadoopCfg( + this.streamingTestUtil.getCtx().getSparkSession().sparkContext().hadoopConfiguration() + ) + ); // run mapper on ip to assert expected List ips = ds.select(ipColumn, expectedCols).collectAsList(); for (Row ip : ips) { - Map result = assertDoesNotThrow(() -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true)); + Map result = assertDoesNotThrow( + () -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true) + ); for (String col : expectedCols) { assertEquals(result.get(col), ip.getAs(ip.fieldIndex(col))); @@ -228,101 +282,164 @@ public void iplocationTest_GeoLite2City_4() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - @DisabledIfSystemProperty(named="skipGeoLiteTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + @DisabledIfSystemProperty( + named = "skipGeoLiteTest", + matches = "true" + ) public void iplocationTest_GeoLite2City_InvalidIPAddress_5() { String mmdbPath = "/usr/share/GeoIP/GeoLite2-City.mmdb"; String[] expectedCols = GEOIP_FULL_COLUMNS; String ipColumn = "otherIP"; this.streamingTestUtil.getCatalystVisitor().setIplocationMmdbPath(mmdbPath); - this.streamingTestUtil.performDPLTest("index=index_A | iplocation 
allfields=true otherIP", this.testFile, ds -> { - LOGGER.info("Consumer dataset's schema is <{}>", ds.schema()); - - // GEO DB type, get db mapper - IplocationGeoIPDataMapper mapper = new IplocationGeoIPDataMapper(mmdbPath, this.streamingTestUtil.getCtx().nullValue, - extractMapFromHadoopCfg(this.streamingTestUtil.getCtx().getSparkSession().sparkContext().hadoopConfiguration())); - - // run mapper on ip to assert expected - List ips = ds.select(ipColumn, expectedCols).collectAsList(); - for (Row ip : ips) { - Map result = assertDoesNotThrow(() -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true)); - - for (String col : expectedCols) { - assertEquals(result.get(col), ip.getAs(ip.fieldIndex(col))); - } - } - }); + this.streamingTestUtil + .performDPLTest("index=index_A | iplocation allfields=true otherIP", this.testFile, ds -> { + LOGGER.info("Consumer dataset's schema is <{}>", ds.schema()); + + // GEO DB type, get db mapper + IplocationGeoIPDataMapper mapper = new IplocationGeoIPDataMapper( + mmdbPath, + this.streamingTestUtil.getCtx().nullValue, + extractMapFromHadoopCfg( + this.streamingTestUtil + .getCtx() + .getSparkSession() + .sparkContext() + .hadoopConfiguration() + ) + ); + + // run mapper on ip to assert expected + List ips = ds.select(ipColumn, expectedCols).collectAsList(); + for (Row ip : ips) { + Map result = assertDoesNotThrow( + () -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true) + ); + + for (String col : expectedCols) { + assertEquals(result.get(col), ip.getAs(ip.fieldIndex(col))); + } + } + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - @DisabledIfSystemProperty(named="skipGeoLiteTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + @DisabledIfSystemProperty( + named = "skipGeoLiteTest", + matches = "true" + ) public void iplocationTest_GeoLite2City_InvalidIPAddress_6() { String mmdbPath = "/usr/share/GeoIP/GeoLite2-City.mmdb"; String[] expectedCols = GEOIP_MINIMAL_COLUMNS; String ipColumn = "otherIP"; this.streamingTestUtil.getCatalystVisitor().setIplocationMmdbPath(mmdbPath); - this.streamingTestUtil.performDPLTest("index=index_A | iplocation otherIP allfields=false", this.testFile, ds -> { - LOGGER.info("Consumer dataset's schema is <{}>", ds.schema()); - - // GEO DB type, get db mapper - IplocationGeoIPDataMapper mapper = new IplocationGeoIPDataMapper(mmdbPath, this.streamingTestUtil.getCtx().nullValue, - extractMapFromHadoopCfg(this.streamingTestUtil.getCtx().getSparkSession().sparkContext().hadoopConfiguration())); - - // run mapper on ip to assert expected - List ips = ds.select(ipColumn, expectedCols).collectAsList(); - for (Row ip : ips) { - Map result = assertDoesNotThrow(() -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true)); - - for (String col : expectedCols) { - assertEquals(result.get(col), ip.getAs(ip.fieldIndex(col))); - } - } - }); + this.streamingTestUtil + .performDPLTest("index=index_A | iplocation otherIP allfields=false", this.testFile, ds -> { + LOGGER.info("Consumer dataset's schema is <{}>", ds.schema()); + + // GEO DB type, get db mapper + IplocationGeoIPDataMapper mapper = new IplocationGeoIPDataMapper( + mmdbPath, + this.streamingTestUtil.getCtx().nullValue, + extractMapFromHadoopCfg( + this.streamingTestUtil + .getCtx() + .getSparkSession() + .sparkContext() + .hadoopConfiguration() + ) + ); + + // run mapper on ip to assert expected + List ips = ds.select(ipColumn, expectedCols).collectAsList(); + for (Row ip : ips) { + Map 
result = assertDoesNotThrow( + () -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true) + ); + + for (String col : expectedCols) { + assertEquals(result.get(col), ip.getAs(ip.fieldIndex(col))); + } + } + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - @DisabledIfSystemProperty(named="skipGeoLiteTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + @DisabledIfSystemProperty( + named = "skipGeoLiteTest", + matches = "true" + ) public void iplocationTest_RirData_InvalidIPAddress_7() { String mmdbPath = "src/test/resources/rir-data.sample.mmdb"; String[] expectedCols = RIR_COLUMNS; String ipColumn = "otherIP"; this.streamingTestUtil.getCatalystVisitor().setIplocationMmdbPath(mmdbPath); - this.streamingTestUtil.performDPLTest("index=index_A | iplocation otherIP allfields=false", this.testFile, ds -> { - LOGGER.info("Consumer dataset's schema is <{}>", ds.schema()); - - // RIR DB type - IplocationRirDataMapper mapper = new IplocationRirDataMapper(mmdbPath, this.streamingTestUtil.getCtx().nullValue, - extractMapFromHadoopCfg(this.streamingTestUtil.getCtx().getSparkSession().sparkContext().hadoopConfiguration())); - - // run mapper on ip to assert expected - List ips = ds.select(ipColumn, expectedCols).collectAsList(); - for (Row ip : ips) { - Map result = assertDoesNotThrow(() -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true)); - - for (String col : expectedCols) { - String expected = result.get(col); - assertEquals(expected, ip.getAs(ip.fieldIndex(col))); - } - } - }); + this.streamingTestUtil + .performDPLTest("index=index_A | iplocation otherIP allfields=false", this.testFile, ds -> { + LOGGER.info("Consumer dataset's schema is <{}>", ds.schema()); + + // RIR DB type + IplocationRirDataMapper mapper = new IplocationRirDataMapper( + mmdbPath, + this.streamingTestUtil.getCtx().nullValue, + extractMapFromHadoopCfg( + this.streamingTestUtil + .getCtx() + .getSparkSession() + .sparkContext() + .hadoopConfiguration() + ) + ); + + // run mapper on ip to assert expected + List ips = ds.select(ipColumn, expectedCols).collectAsList(); + for (Row ip : ips) { + Map result = assertDoesNotThrow( + () -> mapper.call(ip.getAs(ip.fieldIndex(ipColumn)), "en", true) + ); + + for (String col : expectedCols) { + String expected = result.get(col); + assertEquals(expected, ip.getAs(ip.fieldIndex(col))); + } + } + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void iplocationTest_InvalidMmdbPath_8() { String mmdbPath = "/tmp/this-path-is-invalid/fake.mmdb"; this.streamingTestUtil.getCatalystVisitor().setIplocationMmdbPath(mmdbPath); - StreamingQueryException sqe = this.streamingTestUtil.performThrowingDPLTest(StreamingQueryException.class, "index=index_A | iplocation allfields=true source",this.testFile, (ds) -> { - }); - - assertEquals("Caused by: java.lang.RuntimeException: Invalid database file path given for iplocation command.", - this.streamingTestUtil.getInternalCauseString(sqe.cause(), RuntimeException.class)); + StreamingQueryException sqe = this.streamingTestUtil + .performThrowingDPLTest( + StreamingQueryException.class, "index=index_A | iplocation allfields=true source", + this.testFile, (ds) -> { + } + ); + + assertEquals( + "Caused by: java.lang.RuntimeException: Invalid database file path given for iplocation command.", + this.streamingTestUtil.getInternalCauseString(sqe.cause(), 
RuntimeException.class) + ); } // ---------------------------------------- @@ -339,5 +456,3 @@ private Map extractMapFromHadoopCfg(Configuration hadoopCfg) { return hadoopCfgAsMap; } } - - diff --git a/src/test/java/com/teragrep/pth10/JoinTransformationTest.java b/src/test/java/com/teragrep/pth10/JoinTransformationTest.java index 2498925..41f9d01 100644 --- a/src/test/java/com/teragrep/pth10/JoinTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/JoinTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -63,219 +63,280 @@ import static org.junit.jupiter.api.Assertions.*; /** - * Tests for the new ProcessingStack implementation - * Uses streaming datasets + * Tests for the new ProcessingStack implementation Uses streaming datasets + * * @author eemhu - * */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class JoinTransformationTest { - private static final Logger LOGGER = LoggerFactory.getLogger(JoinTransformationTest.class); - - private String testFile = "src/test/resources/joinTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); - - private StreamingTestUtil streamingTestUtil; - - @org.junit.jupiter.api.BeforeAll - void setEnv() { - this.streamingTestUtil = new StreamingTestUtil(this.testSchema); - this.streamingTestUtil.setEnv(); - } - - @org.junit.jupiter.api.BeforeEach - void setUp() { - this.streamingTestUtil.setUp(); - } - - @org.junit.jupiter.api.AfterEach - void tearDown() { - this.streamingTestUtil.tearDown(); - } - - - // ---------------------------------------- - // Tests - // ---------------------------------------- - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void joinRightSideHdfsLoadTest() { - streamingTestUtil.performDPLTest( - "index=index_A earliest=-100y | eval a=12345 | teragrep exec hdfs save /tmp/join0 overwrite=true", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, a]", 
Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - Row r = ds.select("a").distinct().first(); - assertEquals("12345", r.getAs(0).toString()); - } - ); - this.streamingTestUtil.setUp(); // reset for another run - streamingTestUtil.performDPLTest( - "index=index_A earliest=-100y | join partition [ | teragrep exec hdfs load /tmp/join0 | where partition >= 0 ]", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, R_a]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - Row r = ds.select("R_a").distinct().first(); - assertEquals("12345", r.getAs(0).toString()); - } - ); - } - - // type=left max=3 - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void joinTypeLeftMax3Test() { - streamingTestUtil.performDPLTest( - "index=index_A | join type=left max=3 offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ] ", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, R_a]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - List listOfRows = ds.collectAsList(); - - // 3 rows should be not null, since only three subsearch matches are requested using max=3 - int notNulls = 0; - for (Row r : listOfRows) { - if (r.getAs("R_a") != null) { - notNulls++; - } - } - - assertEquals(3, notNulls, "subsearch limit 3, so 3 should be not null"); - } - ); - } - - // max=2 - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void joinMax2TypeImplicitTest() { - streamingTestUtil.performDPLTest( - "index=index_A | join max=2 offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ]", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, R_a]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - assertEquals(2, ds.count(), "Should return 2 rows"); - } - ); - } - - // max=0 overwrite=true - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void join0MaxOverwriteExplicitTest() { - streamingTestUtil.performDPLTest( - "index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream3\", \"2\") | join max=0 overwrite=true offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ]", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, a]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - assertEquals(10, ds.count(), "Should return 10 rows"); - - List listOfAColumn = ds.select("a").collectAsList(); - - for (Row r : listOfAColumn) { - String val = r.getString(0); - assertTrue(val != null, "All rows should have a valid value (non-null) !"); - } - } - ); - } - - // no params - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void joinNoExtraParamsTest() { - streamingTestUtil.performDPLTest( - "index=index_A | join offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ]", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, R_a]", 
Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - assertEquals(1, ds.count(), "Should return 1 row"); - } - ); - } - - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void joinNoExtraCommandsOnMainSearchTest() { - streamingTestUtil.performDPLTest( - "index=index_A | join max=0 overwrite=true offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ]", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, R_a]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - List listOfRows = ds.collectAsList(); - assertEquals(10, listOfRows.size(), "Should return 10 rows, instead returned: " + listOfRows.size()); - } - ); - } - - // max=0, usetime=true, earlier=true - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void joinMax0UsetimeTrueEarlierTrueTest() { - streamingTestUtil.performDPLTest( - "index=index_A | join max=0 usetime=true earlier=true offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ]", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, R_a]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - assertEquals(10, ds.count(), "Should return 10 rows"); - } - ); - } - - // max=0, usetime=true, earlier=false, overwrite=false - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void joinMax0UsetimeTrueEarlierFalseOverwriteFalseTest() { - streamingTestUtil.performDPLTest( - "index=index_A | join max=0 usetime=true earlier=false overwrite=false offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ]", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, R__time, R_id, R__raw, R_index, R_sourcetype, R_host, R_source, R_partition, R_a]", - Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - List listOfRows = ds.collectAsList(); - assertEquals(10, listOfRows.size(), "Should return 10 rows, instead returned: " + listOfRows.size()); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void joinInvalidRightSideTest() { - RuntimeException rte = this.streamingTestUtil.performThrowingDPLTest(RuntimeException.class, - "| makeresults count=1 | eval a=1 | join a [search]" , testFile, ds -> {}); - - Assertions.assertEquals("Join command encountered an error: Subsearch dataset (right side) missing expected field 'a'", rte.getMessage()); - } -} \ No newline at end of file + + private static final Logger LOGGER = LoggerFactory.getLogger(JoinTransformationTest.class); + + private String testFile = "src/test/resources/joinTransformationTest_data*.json"; // * to make the path into a directory path + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new 
StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); + + private StreamingTestUtil streamingTestUtil; + + @org.junit.jupiter.api.BeforeAll + void setEnv() { + this.streamingTestUtil = new StreamingTestUtil(this.testSchema); + this.streamingTestUtil.setEnv(); + } + + @org.junit.jupiter.api.BeforeEach + void setUp() { + this.streamingTestUtil.setUp(); + } + + @org.junit.jupiter.api.AfterEach + void tearDown() { + this.streamingTestUtil.tearDown(); + } + + // ---------------------------------------- + // Tests + // ---------------------------------------- + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void joinRightSideHdfsLoadTest() { + streamingTestUtil + .performDPLTest( + "index=index_A earliest=-100y | eval a=12345 | teragrep exec hdfs save /tmp/join0 overwrite=true", + testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, a]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" + ); + + Row r = ds.select("a").distinct().first(); + assertEquals("12345", r.getAs(0).toString()); + } + ); + this.streamingTestUtil.setUp(); // reset for another run + streamingTestUtil + .performDPLTest( + "index=index_A earliest=-100y | join partition [ | teragrep exec hdfs load /tmp/join0 | where partition >= 0 ]", + testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, R_a]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" + ); + + Row r = ds.select("R_a").distinct().first(); + assertEquals("12345", r.getAs(0).toString()); + } + ); + } + + // type=left max=3 + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void joinTypeLeftMax3Test() { + streamingTestUtil + .performDPLTest( + "index=index_A | join type=left max=3 offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ] ", + testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, R_a]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" + ); + + List listOfRows = ds.collectAsList(); + + // 3 rows should be not null, since only three subsearch matches are requested using max=3 + int notNulls = 0; + for (Row r : listOfRows) { + if (r.getAs("R_a") != null) { + notNulls++; + } + } + + assertEquals(3, notNulls, "subsearch limit 3, so 3 should be not null"); + } + ); + } + + // max=2 + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void joinMax2TypeImplicitTest() { + streamingTestUtil + .performDPLTest( + "index=index_A | join max=2 offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ]", + testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, R_a]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" 
+ ); + + assertEquals(2, ds.count(), "Should return 2 rows"); + } + ); + } + + // max=0 overwrite=true + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void join0MaxOverwriteExplicitTest() { + streamingTestUtil + .performDPLTest( + "index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream3\", \"2\") | join max=0 overwrite=true offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ]", + testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, a]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" + ); + + assertEquals(10, ds.count(), "Should return 10 rows"); + + List listOfAColumn = ds.select("a").collectAsList(); + + for (Row r : listOfAColumn) { + String val = r.getString(0); + assertTrue(val != null, "All rows should have a valid value (non-null) !"); + } + } + ); + } + + // no params + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void joinNoExtraParamsTest() { + streamingTestUtil + .performDPLTest( + "index=index_A | join offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ]", + testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, R_a]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" + ); + + assertEquals(1, ds.count(), "Should return 1 row"); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void joinNoExtraCommandsOnMainSearchTest() { + streamingTestUtil + .performDPLTest( + "index=index_A | join max=0 overwrite=true offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ]", + testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, R_a]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" + ); + + List listOfRows = ds.collectAsList(); + assertEquals( + 10, listOfRows.size(), + "Should return 10 rows, instead returned: " + listOfRows.size() + ); + } + ); + } + + // max=0, usetime=true, earlier=true + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void joinMax0UsetimeTrueEarlierTrueTest() { + streamingTestUtil + .performDPLTest( + "index=index_A | join max=0 usetime=true earlier=true offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ]", + testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, R_a]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" 
+ ); + + assertEquals(10, ds.count(), "Should return 10 rows"); + } + ); + } + + // max=0, usetime=true, earlier=false, overwrite=false + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void joinMax0UsetimeTrueEarlierFalseOverwriteFalseTest() { + streamingTestUtil + .performDPLTest( + "index=index_A | join max=0 usetime=true earlier=false overwrite=false offset [ search index=index_A | eval a=case(sourcetype=\"stream1\", \"1\", sourcetype=\"stream2\", \"2\") ]", + testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, R__time, R_id, R__raw, R_index, R_sourcetype, R_host, R_source, R_partition, R_a]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + + List listOfRows = ds.collectAsList(); + assertEquals( + 10, listOfRows.size(), + "Should return 10 rows, instead returned: " + listOfRows.size() + ); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void joinInvalidRightSideTest() { + RuntimeException rte = this.streamingTestUtil + .performThrowingDPLTest( + RuntimeException.class, "| makeresults count=1 | eval a=1 | join a [search]", testFile, ds -> { + } + ); + + Assertions + .assertEquals( + "Join command encountered an error: Subsearch dataset (right side) missing expected field 'a'", + rte.getMessage() + ); + } +} diff --git a/src/test/java/com/teragrep/pth10/MakeresultsTransformationTest.java b/src/test/java/com/teragrep/pth10/MakeresultsTransformationTest.java index 09c47b9..e2c94a4 100644 --- a/src/test/java/com/teragrep/pth10/MakeresultsTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/MakeresultsTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -60,11 +60,12 @@ /** * @author eemhu - * */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class MakeresultsTransformationTest { + private StreamingTestUtil streamingTestUtil; + @org.junit.jupiter.api.BeforeAll void setEnv() { this.streamingTestUtil = new StreamingTestUtil(); @@ -81,87 +82,81 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void makeresults_BasicQuery_Test() { - this.streamingTestUtil.performDPLTest( - "| makeresults", "", - ds -> { - assertEquals(new StructType(new StructField[]{ + this.streamingTestUtil.performDPLTest("| makeresults", "", ds -> { + assertEquals(new StructType(new StructField[] { new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()) - }), ds.schema()); - assertEquals(1, ds.count()); - } - ); + }), ds.schema()); + assertEquals(1, ds.count()); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void makeresults_Annotate_Test() { - this.streamingTestUtil.performDPLTest( - "| makeresults annotate=true", "", - ds -> { - assertEquals(new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("struck_server", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("struck_server_group", DataTypes.StringType, true, new MetadataBuilder().build()) - } - ),ds.schema()); - assertEquals(1, ds.count()); + this.streamingTestUtil.performDPLTest("| makeresults annotate=true", "", ds -> { + assertEquals(new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("struck_server", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("struck_server_group", DataTypes.StringType, true, new MetadataBuilder().build()) + }), ds.schema()); + assertEquals(1, ds.count()); - // get all rows except '_time' - List rows = ds.drop("_time").collectAsList(); - assertEquals(1, rows.size()); - // assert all of them to be null - rows.forEach(row -> { - assertEquals(6, row.length()); - for (int i = 0; i < row.length(); i++) { - assertEquals(this.streamingTestUtil.getCtx().nullValue.value(), row.get(i)); - } - }); - } - ); + // get all rows except '_time' + List rows = ds.drop("_time").collectAsList(); + assertEquals(1, rows.size()); + // assert all of them to be null + rows.forEach(row -> { + 
assertEquals(6, row.length()); + for (int i = 0; i < row.length(); i++) { + assertEquals(this.streamingTestUtil.getCtx().nullValue.value(), row.get(i)); + } + }); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void makeresults_Count100_Test() { - this.streamingTestUtil.performDPLTest( - "| makeresults count=100","", - ds -> { - assertEquals(new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - } - ), ds.schema()); - assertEquals(100, ds.count()); - } - ); + this.streamingTestUtil.performDPLTest("| makeresults count=100", "", ds -> { + assertEquals(new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + }), ds.schema()); + assertEquals(100, ds.count()); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void makeresults_WithEval_Test() { - this.streamingTestUtil.performDPLTest( - "| makeresults | eval a = 1", "", - ds -> { - assertEquals(new StructType(new StructField[]{ + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void makeresults_WithEval_Test() { + this.streamingTestUtil.performDPLTest("| makeresults | eval a = 1", "", ds -> { + assertEquals(new StructType(new StructField[] { new StructField("_time", DataTypes.TimestampType, true, new MetadataBuilder().build()), new StructField("a", DataTypes.IntegerType, false, new MetadataBuilder().build()) - }), ds.schema()); - assertEquals(1, ds.count()); - } - ); + }), ds.schema()); + assertEquals(1, ds.count()); + }); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/PredictTransformationTest.java b/src/test/java/com/teragrep/pth10/PredictTransformationTest.java index f083dc2..4bacc48 100644 --- a/src/test/java/com/teragrep/pth10/PredictTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/PredictTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. 
*/ - package com.teragrep.pth10; import org.apache.spark.sql.*; @@ -65,22 +64,21 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class PredictTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(PredictTransformationTest.class); private final String testFile = "src/test/resources/predictTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -100,7 +98,6 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- @@ -108,50 +105,80 @@ void tearDown() { // TODO Implement tests // FIXME: parser issues with upperXX=field / lowerXX=field (requires spaces) @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void predictTest_OneHourSpan_OnePredColumn() { - streamingTestUtil.performDPLTest( - "index=* | timechart span=1h avg(offset) as avgo | predict avgo AS pred upper 98 = u98 lower 98 = l98", - testFile, - ds -> { - assertEquals(Arrays.asList("_time", "avgo", "pred", "u98(pred)", "l98(pred)"), Arrays.asList(ds.schema().fieldNames())); - - // future_timespan=5 -> five nulls - List lr = ds.select("avgo").collectAsList().stream().filter(r -> r.get(0) == null).collect(Collectors.toList()); - assertEquals(5, lr.size()); - } - ); + streamingTestUtil + .performDPLTest( + "index=* | timechart span=1h avg(offset) as avgo | predict avgo AS pred upper 98 = u98 lower 98 = l98", + testFile, ds -> { + assertEquals( + Arrays.asList("_time", "avgo", "pred", "u98(pred)", "l98(pred)"), Arrays.asList(ds.schema().fieldNames()) + ); + + // future_timespan=5 -> five nulls + List lr = 
ds + .select("avgo") + .collectAsList() + .stream() + .filter(r -> r.get(0) == null) + .collect(Collectors.toList()); + assertEquals(5, lr.size()); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void predictTest_OneHourSpan_FutureTimeSpan() { - streamingTestUtil.performDPLTest( - "index=* | timechart span=1h avg(offset) as avgo | predict avgo AS pred future_timespan=10", - testFile, - ds -> { - assertEquals(Arrays.asList("_time", "avgo", "pred", "upper95(pred)", "lower95(pred)"), Arrays.asList(ds.schema().fieldNames())); - - // future_timespan=10 -> ten nulls - List lr = ds.select("avgo").collectAsList().stream().filter(r -> r.get(0) == null).collect(Collectors.toList()); - assertEquals(10, lr.size()); - } - ); + streamingTestUtil + .performDPLTest( + "index=* | timechart span=1h avg(offset) as avgo | predict avgo AS pred future_timespan=10", + testFile, ds -> { + assertEquals( + Arrays.asList("_time", "avgo", "pred", "upper95(pred)", "lower95(pred)"), Arrays.asList(ds.schema().fieldNames()) + ); + + // future_timespan=10 -> ten nulls + List lr = ds + .select("avgo") + .collectAsList() + .stream() + .filter(r -> r.get(0) == null) + .collect(Collectors.toList()); + assertEquals(10, lr.size()); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void predictTest_OneHourSpan_LLT() { - streamingTestUtil.performDPLTest( - "index=* | timechart span=1h avg(offset) as avgo | predict avgo AS pred algorithm=LLT future_timespan=10 ", - testFile, - ds -> { - assertEquals(Arrays.asList("_time", "avgo", "pred", "upper95(pred)", "lower95(pred)"), Arrays.asList(ds.schema().fieldNames())); - - // future_timespan=10 -> ten nulls - List lr = ds.select("avgo").collectAsList().stream().filter(r -> r.get(0) == null).collect(Collectors.toList()); - assertEquals(10, lr.size()); - } - ); + streamingTestUtil + .performDPLTest( + "index=* | timechart span=1h avg(offset) as avgo | predict avgo AS pred algorithm=LLT future_timespan=10 ", + testFile, ds -> { + assertEquals( + Arrays.asList("_time", "avgo", "pred", "upper95(pred)", "lower95(pred)"), Arrays.asList(ds.schema().fieldNames()) + ); + + // future_timespan=10 -> ten nulls + List lr = ds + .select("avgo") + .collectAsList() + .stream() + .filter(r -> r.get(0) == null) + .collect(Collectors.toList()); + assertEquals(10, lr.size()); + } + ); } } diff --git a/src/test/java/com/teragrep/pth10/RangemapTransformationTest.java b/src/test/java/com/teragrep/pth10/RangemapTransformationTest.java index a905592..3323746 100644 --- a/src/test/java/com/teragrep/pth10/RangemapTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/RangemapTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -63,21 +63,20 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class RangemapTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(RangemapTransformationTest.class); private final String testFile = "src/test/resources/numberData_0*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -102,105 +101,117 @@ void tearDown() { // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rangemapTest0() { - streamingTestUtil.performDPLTest( - "index=* | rangemap field=_raw", - testFile, - ds -> { - List result = ds.select("range").distinct().collectAsList(); - assertEquals(1, result.size()); - assertEquals("None", result.get(0).getList(0).get(0)); - } - ); + streamingTestUtil.performDPLTest("index=* | rangemap field=_raw", testFile, ds -> { + List result = ds.select("range").distinct().collectAsList(); + assertEquals(1, result.size()); + assertEquals("None", result.get(0).getList(0).get(0)); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rangemapTest1() { - streamingTestUtil.performDPLTest( - "index=* | rangemap field=_raw default=xyz", - testFile, - ds -> { - List result = ds.select("range").distinct().collectAsList(); - assertEquals(1, result.size()); - assertEquals("xyz", result.get(0).getList(0).get(0)); - } - ); + streamingTestUtil.performDPLTest("index=* | rangemap field=_raw default=xyz", testFile, ds -> { + List result = ds.select("range").distinct().collectAsList(); + assertEquals(1, result.size()); + assertEquals("xyz", result.get(0).getList(0).get(0)); + }); } @Test - 
@DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rangemapTest2() { - streamingTestUtil.performDPLTest( - "index=* | rangemap field=_raw lo=0-5 med=6-34 hi=35-48 vlo=-20--10", - testFile, - ds -> { - List result = ds.select("_raw", "range").collectAsList(); - assertEquals(5, result.size()); - result.forEach(r -> { - double val = Double.parseDouble(r.getAs(0).toString()); - if (val == 35d) { - assertEquals("hi", r.getList(1).get(0)); - } else if (val == 10d) { - assertEquals("med", r.getList(1).get(0)); - } else if (val == -10d) { - assertEquals("vlo", r.getList(1).get(0)); - } else if (val == 0d) { - assertEquals("lo", r.getList(1).get(0)); - } else if (val == 47.2d) { - assertEquals("hi", r.getList(1).get(0)); - } else { - fail("Unexpected _raw value: " + val); - } + streamingTestUtil + .performDPLTest("index=* | rangemap field=_raw lo=0-5 med=6-34 hi=35-48 vlo=-20--10", testFile, ds -> { + List result = ds.select("_raw", "range").collectAsList(); + assertEquals(5, result.size()); + result.forEach(r -> { + double val = Double.parseDouble(r.getAs(0).toString()); + if (val == 35d) { + assertEquals("hi", r.getList(1).get(0)); + } + else if (val == 10d) { + assertEquals("med", r.getList(1).get(0)); + } + else if (val == -10d) { + assertEquals("vlo", r.getList(1).get(0)); + } + else if (val == 0d) { + assertEquals("lo", r.getList(1).get(0)); + } + else if (val == 47.2d) { + assertEquals("hi", r.getList(1).get(0)); + } + else { + fail("Unexpected _raw value: " + val); + } + }); }); - } - ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rangemapTest3() { - IllegalArgumentException iae = this.streamingTestUtil.performThrowingDPLTest(IllegalArgumentException.class, "index=* | rangemap", testFile, ds -> {}); + IllegalArgumentException iae = this.streamingTestUtil + .performThrowingDPLTest(IllegalArgumentException.class, "index=* | rangemap", testFile, ds -> { + }); assertEquals("Field parameter is required!", iae.getMessage()); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rangemapTest4() { - streamingTestUtil.performDPLTest( - "| makeresults | eval _raw = \"string\" | rangemap field=_raw r0=0-10 r1=11-20", - testFile, - ds -> { - // strings result in default value - List result = ds.select("range").distinct().collectAsList(); - assertEquals(1, result.size()); - assertEquals("None", result.get(0).getList(0).get(0)); - } - ); + streamingTestUtil + .performDPLTest( + "| makeresults | eval _raw = \"string\" | rangemap field=_raw r0=0-10 r1=11-20", testFile, + ds -> { + // strings result in default value + List result = ds.select("range").distinct().collectAsList(); + assertEquals(1, result.size()); + assertEquals("None", result.get(0).getList(0).get(0)); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rangemapMultiValueTest() { - streamingTestUtil.performDPLTest( - "index=* | eval a = mvappend(\"1\",\"3\",\"3\",\"a\") |rangemap field=a lo=1-2 hi=3-4", - testFile, - ds -> { - List result = ds.select("range").distinct().collectAsList(); - assertEquals(1, result.size()); - List resultList = result.get(0).getList(0); - assertEquals(2, 
resultList.size()); - List expected = Arrays.asList("lo", "hi"); - - for (String res : resultList) { - if (!expected.contains(res)) { - fail("Expected values did not contain result value: " + res); - } - } - } - ); + streamingTestUtil + .performDPLTest( + "index=* | eval a = mvappend(\"1\",\"3\",\"3\",\"a\") |rangemap field=a lo=1-2 hi=3-4", + testFile, ds -> { + List result = ds.select("range").distinct().collectAsList(); + assertEquals(1, result.size()); + List resultList = result.get(0).getList(0); + assertEquals(2, resultList.size()); + List expected = Arrays.asList("lo", "hi"); + + for (String res : resultList) { + if (!expected.contains(res)) { + fail("Expected values did not contain result value: " + res); + } + } + } + ); } } - - diff --git a/src/test/java/com/teragrep/pth10/RegexTransformationTest.java b/src/test/java/com/teragrep/pth10/RegexTransformationTest.java index 04a6ff0..eff126e 100644 --- a/src/test/java/com/teragrep/pth10/RegexTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/RegexTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -59,29 +59,27 @@ import static org.junit.jupiter.api.Assertions.assertTrue; /** - * Tests for RegexTransformation - * Uses streaming datasets + * Tests for RegexTransformation Uses streaming datasets + * * @author eemhu - * */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class RegexTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(RegexTransformationTest.class); private final String testFile = "src/test/resources/regexTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new 
StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -101,71 +99,65 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void regexTest1() { - streamingTestUtil.performDPLTest( - "index=index_A | regex _raw != \"data data\"", - testFile, - ds -> { - assertEquals(0, ds.collectAsList().size()); - } - ); + streamingTestUtil.performDPLTest("index=index_A | regex _raw != \"data data\"", testFile, ds -> { + assertEquals(0, ds.collectAsList().size()); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void regexTest2() { - streamingTestUtil.performDPLTest( - "index=index_A | regex _raw = \"data data\"", - testFile, - ds -> { - int size = ds.collectAsList().size(); - assertTrue(size > 1); - } - ); + streamingTestUtil.performDPLTest("index=index_A | regex _raw = \"data data\"", testFile, ds -> { + int size = ds.collectAsList().size(); + assertTrue(size > 1); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void regexTest3() { - streamingTestUtil.performDPLTest( - "index=index_A | regex \"data data\"", - testFile, - ds -> { - int size = ds.collectAsList().size(); - assertTrue(size > 1); - } - ); + streamingTestUtil.performDPLTest("index=index_A | regex \"data data\"", testFile, ds -> { + int size = ds.collectAsList().size(); + assertTrue(size > 1); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void regexTest4() { - streamingTestUtil.performDPLTest( - "index=index_A | regex \"^[d|D][a|z][t|T][a|B]\\s.{4}$\"", - testFile, - ds -> { - int size = ds.collectAsList().size(); - assertTrue(size > 1); - } - ); + streamingTestUtil.performDPLTest("index=index_A | regex \"^[d|D][a|z][t|T][a|B]\\s.{4}$\"", testFile, ds -> { + int size = ds.collectAsList().size(); + assertTrue(size > 1); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void regexTest5() { - streamingTestUtil.performDPLTest( - "index=index_A | regex \"^[d|D][a|z][t|T][c|B]\\s.{4}$\"", - testFile, - ds -> { - assertEquals(0, ds.collectAsList().size()); - } - ); + streamingTestUtil.performDPLTest("index=index_A | regex \"^[d|D][a|z][t|T][c|B]\\s.{4}$\"", testFile, ds -> { + assertEquals(0, ds.collectAsList().size()); + }); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/RenameTransformationTest.java b/src/test/java/com/teragrep/pth10/RenameTransformationTest.java index 
c30382a..d6b0aa4 100644 --- a/src/test/java/com/teragrep/pth10/RenameTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/RenameTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -60,29 +60,27 @@ import static org.junit.jupiter.api.Assertions.assertEquals; /** - * Tests for the RenameTransformationTest implementation - * Uses streaming datasets + * Tests for the RenameTransformationTest implementation Uses streaming datasets + * * @author eemhu - * */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class RenameTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(RenameTransformationTest.class); private final String testFile = "src/test/resources/regexTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -102,22 +100,26 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + 
@DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rename_test_1() { - streamingTestUtil.performDPLTest( - "index=index_A | rename _raw AS DATA , offset AS number, sourcetype AS typeOfSource, INVALID_FIELD AS fieldOfInvalid", - testFile, - ds -> { - assertEquals("[_time, id, DATA, index, typeOfSource, host, source, partition, number]", - Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | rename _raw AS DATA , offset AS number, sourcetype AS typeOfSource, INVALID_FIELD AS fieldOfInvalid", + testFile, ds -> { + assertEquals( + "[_time, id, DATA, index, typeOfSource, host, source, partition, number]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" + ); + } + ); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/ReplaceTransformationTest.java b/src/test/java/com/teragrep/pth10/ReplaceTransformationTest.java index 09073c0..addd41f 100644 --- a/src/test/java/com/teragrep/pth10/ReplaceTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/ReplaceTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -61,29 +61,27 @@ import static org.junit.jupiter.api.Assertions.assertEquals; /** - * Tests for the ReplaceTransformation implementation - * Uses streaming datasets + * Tests for the ReplaceTransformation implementation Uses streaming datasets + * * @author eemhu - * */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class ReplaceTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(ReplaceTransformationTest.class); private final String testFile = "src/test/resources/replaceTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -103,84 +101,123 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // Standard replace, without wildcards in WITH-clause + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // Standard replace, without wildcards in WITH-clause public void replace_test_1() { - streamingTestUtil.performDPLTest( - "index=index_A | replace \"?$.data*\" WITH \"SomethingNew\" IN _raw", - testFile, - ds -> { - List listOfRawCol = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(1, listOfRawCol.size()); - assertEquals("SomethingNew", listOfRawCol.get(0)); - } - ); + streamingTestUtil + .performDPLTest("index=index_A | replace \"?$.data*\" WITH \"SomethingNew\" IN _raw", testFile, ds -> { + List listOfRawCol = ds + .select("_raw") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> 
r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(1, listOfRawCol.size()); + assertEquals("SomethingNew", listOfRawCol.get(0)); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // One trailing wildcard in WITH-clause + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // One trailing wildcard in WITH-clause public void replace_test_2() { - streamingTestUtil.performDPLTest( - "index=index_A | replace \"?$.data*\" WITH \"SomethingNew*\" IN _raw", - testFile, - ds -> { - List listOfRawCol = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(1, listOfRawCol.size()); - assertEquals("SomethingNew^){", listOfRawCol.get(0)); - } - ); + streamingTestUtil + .performDPLTest("index=index_A | replace \"?$.data*\" WITH \"SomethingNew*\" IN _raw", testFile, ds -> { + List listOfRawCol = ds + .select("_raw") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(1, listOfRawCol.size()); + assertEquals("SomethingNew^){", listOfRawCol.get(0)); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // One wildcard in WITH-clause as a prefix + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // One wildcard in WITH-clause as a prefix public void replace_test_3() { - streamingTestUtil.performDPLTest( - "index=index_A | replace \"*data^){\" WITH \"SomethingNew*\" IN _raw", - testFile, - ds -> { - List listOfRawCol = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(1, listOfRawCol.size()); - assertEquals("SomethingNew?$.", listOfRawCol.get(0)); - } - ); + streamingTestUtil + .performDPLTest("index=index_A | replace \"*data^){\" WITH \"SomethingNew*\" IN _raw", testFile, ds -> { + List listOfRawCol = ds + .select("_raw") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(1, listOfRawCol.size()); + assertEquals("SomethingNew?$.", listOfRawCol.get(0)); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // Two wildcards, both as a prefix and trailing in WITH-clause + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // Two wildcards, both as a prefix and trailing in WITH-clause public void replace_test_4() { - streamingTestUtil.performDPLTest( - "index=index_A | replace \"*data*\" WITH \"*SomethingNew*\" IN _raw", - testFile, - ds -> { - List listOfRawCol = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(1, listOfRawCol.size()); - assertEquals("?$.SomethingNew^){", listOfRawCol.get(0)); - } - ); + streamingTestUtil + .performDPLTest("index=index_A | replace \"*data*\" WITH \"*SomethingNew*\" IN _raw", testFile, ds -> { + List listOfRawCol = ds + .select("_raw") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(1, listOfRawCol.size()); + assertEquals("?$.SomethingNew^){", listOfRawCol.get(0)); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // Two x WITH y constructs + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // Two x WITH y constructs public void 
replaceTwoValuesTest() { - streamingTestUtil.performDPLTest( - "index=index_A | replace host WITH lost, index_A WITH index_B IN host, index", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset]", - Arrays.toString(ds.columns())); - - List listOfHost = ds.select("host").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(1, listOfHost.size()); - assertEquals("lost", listOfHost.get(0)); - - List listOfIndex = ds.select("index").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(1, listOfIndex.size()); - assertEquals("index_B", listOfIndex.get(0)); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | replace host WITH lost, index_A WITH index_B IN host, index", testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset]", + Arrays.toString(ds.columns()) + ); + + List listOfHost = ds + .select("host") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(1, listOfHost.size()); + assertEquals("lost", listOfHost.get(0)); + + List listOfIndex = ds + .select("index") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(1, listOfIndex.size()); + assertEquals("index_B", listOfIndex.get(0)); + } + ); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/Rex4jTransformationTest.java b/src/test/java/com/teragrep/pth10/Rex4jTransformationTest.java index 9341836..20bf4ea 100644 --- a/src/test/java/com/teragrep/pth10/Rex4jTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/Rex4jTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -63,22 +63,21 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class Rex4jTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(Rex4jTransformationTest.class); private final String testFile = "src/test/resources/rex4jTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -103,112 +102,144 @@ void tearDown() { // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rex4jTest_ExtractionModeMultipleCaptureGroups() { - streamingTestUtil.performDPLTest( - "index=index_A | rex4j \".*rainfall_rate\\\":\\s(?\\d+.\\d+).*wind_speed\\\":\\s(?\\d+.\\d+).*latitude\\\":\\s(?-?\\d+.\\d+)\"", - testFile, - ds -> { - // get extracted column data - List rainfallRate = ds.select("rainFALL").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List windSpeed = ds.select("windSPEDE").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List latitude = ds.select("latiTUDE").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, latitude.size()); - assertEquals(1, windSpeed.size()); - assertEquals(1, rainfallRate.size()); - - // check values - assertEquals("25.5", rainfallRate.get(0)); - assertEquals("51.0", windSpeed.get(0)); - assertEquals("-89.625", latitude.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | rex4j 
\".*rainfall_rate\\\":\\s(?\\d+.\\d+).*wind_speed\\\":\\s(?\\d+.\\d+).*latitude\\\":\\s(?-?\\d+.\\d+)\"", + testFile, ds -> { + // get extracted column data + List rainfallRate = ds + .select("rainFALL") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List windSpeed = ds + .select("windSPEDE") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List latitude = ds + .select("latiTUDE") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, latitude.size()); + assertEquals(1, windSpeed.size()); + assertEquals(1, rainfallRate.size()); + + // check values + assertEquals("25.5", rainfallRate.get(0)); + assertEquals("51.0", windSpeed.get(0)); + assertEquals("-89.625", latitude.get(0)); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rex4jTest_ExtractionModeSingleCaptureGroup() { - streamingTestUtil.performDPLTest( - "index=index_A | rex4j \".*rainfall_rate\\\":\\s(?\\d+.\\d+)\"", - testFile, - ds -> { - // get extracted column data - List rainfallRate = ds.select("rainFALL").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, rainfallRate.size()); - - // check values - assertEquals("25.5", rainfallRate.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | rex4j \".*rainfall_rate\\\":\\s(?\\d+.\\d+)\"", testFile, ds -> { + // get extracted column data + List rainfallRate = ds.select("rainFALL").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, rainfallRate.size()); + + // check values + assertEquals("25.5", rainfallRate.get(0)); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rex4jTest_ExtractionMode_EmptyResults() { - streamingTestUtil.performDPLTest( - "index=index_A | rex4j \"(?)\"", - testFile, - ds -> { - // get extracted column data - List rainfallRate = ds.select("rainFALL").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0)).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, rainfallRate.size()); - - // check values - assertEquals(streamingTestUtil.getCtx().nullValue.value(), rainfallRate.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | rex4j \"(?)\"", testFile, ds -> { + // get extracted column data + List rainfallRate = ds.select("rainFALL").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0)).collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, rainfallRate.size()); + + // check values + assertEquals(streamingTestUtil.getCtx().nullValue.value(), rainfallRate.get(0)); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rex4jTest_SedMode() { - streamingTestUtil.performDPLTest( - "index=index_A | rex4j mode=sed \"s/rainfall_rate/meltdown_rate/g\"", - testFile, - ds -> { - // get extracted column data - List rawData = 
ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, rawData.size()); - - // check values - assertEquals("{\"meltdown_rate\": 25.5, \"wind_speed\": 51.0, \"atmosphere_water_vapor_content\": 76.5, \"atmosphere_cloud_liquid_water_content\": 2.5, \"latitude\": -89.625, \"longitude\": 139.875}", - rawData.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | rex4j mode=sed \"s/rainfall_rate/meltdown_rate/g\"", testFile, ds -> { + // get extracted column data + List rawData = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, rawData.size()); + + // check values + assertEquals( + "{\"meltdown_rate\": 25.5, \"wind_speed\": 51.0, \"atmosphere_water_vapor_content\": 76.5, \"atmosphere_cloud_liquid_water_content\": 2.5, \"latitude\": -89.625, \"longitude\": 139.875}", + rawData.get(0) + ); + } + ); } @Test @Disabled(value = "Needs Spark 3.x to work") - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rex4jTest_ExtractNestedCaptureGroup_Issue391() { // FIXME: Rex4j needs regexp_extract_all support introduced in Spark 3.x // Workaround is to use rex command instead - streamingTestUtil.performDPLTest( - "| makeresults count=1 |" + - " eval _raw=\"RandomStuff name\\=john [INFO]notification(abc): " + - "start\\=\\\"2022-01-01 11:11:11\\\" length\\=22 id\\=123 service\\=tcp/port:9999 " + - "type\\=1 res zone\\=main spw zone\\=lan action\\=ok transmit\\=10 pswd\\=0 " + - "src\\=192.168.4.2 dst\\=192.168.0.15 res_port\\=9999 spw_port\\=9998 res-xl ip\\=192.168.4.2 " + - "port\\=99999 spw-xl ip\\=192.168.0.15 port\\=99998 sess_id\\=12345 reason\\=stop - TIME OUT\" " + - "| rex4j \"id=(?\\d+) service=(?(tcp|udp)\\/port:\\d+) type=\\d+ res zone=(?.*?) spw zone=(?.*?) action=(?.*?) transmit=(?\\d+) pswd=(?\\d+) src=(?\\d+\\.\\d+\\.\\d+\\.\\d+) dst=(?\\d+\\.\\d+\\.\\d+\\.\\d+)( res_port=\\d+ res_port=(?\\d+)(.*?))? sess_id=(?\\d+) reason=(?.*?)$\" | fields - _raw", - testFile, - ds -> { - // get extracted column data - List rawData = ds.select("resZone") - .dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, rawData.size()); - - // check values - assertEquals("main", - rawData.get(0)); - }); + streamingTestUtil + .performDPLTest( + "| makeresults count=1 |" + " eval _raw=\"RandomStuff name\\=john [INFO]notification(abc): " + + "start\\=\\\"2022-01-01 11:11:11\\\" length\\=22 id\\=123 service\\=tcp/port:9999 " + + "type\\=1 res zone\\=main spw zone\\=lan action\\=ok transmit\\=10 pswd\\=0 " + + "src\\=192.168.4.2 dst\\=192.168.0.15 res_port\\=9999 spw_port\\=9998 res-xl ip\\=192.168.4.2 " + + "port\\=99999 spw-xl ip\\=192.168.0.15 port\\=99998 sess_id\\=12345 reason\\=stop - TIME OUT\" " + + "| rex4j \"id=(?\\d+) service=(?(tcp|udp)\\/port:\\d+) type=\\d+ res zone=(?.*?) spw zone=(?.*?) action=(?.*?) transmit=(?\\d+) pswd=(?\\d+) src=(?\\d+\\.\\d+\\.\\d+\\.\\d+) dst=(?\\d+\\.\\d+\\.\\d+\\.\\d+)( res_port=\\d+ res_port=(?\\d+)(.*?))? 
sess_id=(?\\d+) reason=(?.*?)$\" | fields - _raw", + testFile, ds -> { + // get extracted column data + List rawData = ds.select("resZone").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, rawData.size()); + + // check values + assertEquals("main", rawData.get(0)); + } + ); } } - - diff --git a/src/test/java/com/teragrep/pth10/RexTransformationTest.java b/src/test/java/com/teragrep/pth10/RexTransformationTest.java index 7f2a41d..5fb2095 100644 --- a/src/test/java/com/teragrep/pth10/RexTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/RexTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -60,26 +60,24 @@ import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertThrows; @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class RexTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(RexTransformationTest.class); private final String testFile = "src/test/resources/rexTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new 
MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -104,200 +102,263 @@ void tearDown() { // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rexTest_ExtractionModeSingleCaptureGroup() { - streamingTestUtil.performDPLTest( - "index=index_A | rex \".*rainfall_rate\\\":\\s(?\\d+.\\d+)\"", - testFile, - ds -> { - // get extracted column data - List rainfallRate = ds.select("rainFALL").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, rainfallRate.size()); - - // check values - assertEquals("139.875", rainfallRate.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | rex \".*rainfall_rate\\\":\\s(?\\d+.\\d+)\"", testFile, ds -> { + // get extracted column data + List rainfallRate = ds.select("rainFALL").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, rainfallRate.size()); + + // check values + assertEquals("139.875", rainfallRate.get(0)); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rexTest_ExtractionModeSingleCaptureGroup_NoMatch() { - streamingTestUtil.performDPLTest( - "index=index_A | rex \".*rainfall_rate\\\":\\s(?(noMatch))\"", - testFile, - ds -> { - // get extracted column data - List rainfallRate = ds.select("rainFALL").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0)).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, rainfallRate.size()); - - // check values: should be nullValue - assertEquals(streamingTestUtil.getCtx().nullValue.value(), rainfallRate.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | rex \".*rainfall_rate\\\":\\s(?(noMatch))\"", testFile, ds -> { + // get extracted column data + List rainfallRate = ds.select("rainFALL").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0)).collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, rainfallRate.size()); + + // check values: should be nullValue + assertEquals(streamingTestUtil.getCtx().nullValue.value(), rainfallRate.get(0)); + } + ); } // underscore in the capture group name @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rexTest_ExtractionModeSingleCaptureGroupWithUnderscore() { - streamingTestUtil.performDPLTest( - "index=index_A | rex \".*rainfall_rate\\\":\\s(?\\d+.\\d+)\"", - testFile, - ds -> { - // get extracted column data - List rainfallRate = ds.select("rain_FALL").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, rainfallRate.size()); - - // check values - assertEquals("139.875", rainfallRate.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | rex \".*rainfall_rate\\\":\\s(?\\d+.\\d+)\"", testFile, ds -> { + // get extracted column data + List rainfallRate = ds.select("rain_FALL").dropDuplicates().collectAsList().stream().map(r -> 
r.getAs(0).toString()).collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, rainfallRate.size()); + + // check values + assertEquals("139.875", rainfallRate.get(0)); + } + ); } // underscore in the capture group name, rex in sequential mode @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rexTest_ExtractionModeSingleCaptureGroupWithUnderscore_sequentialMode() { // sort command is sequential only - streamingTestUtil.performDPLTest( - "index=index_A | sort offset | rex \".*rainfall_rate\\\":\\s(?<rain_FALL>\\d+.\\d+)\"", - testFile, - ds -> { - // get extracted column data - List<String> rainfallRate = ds.select("rain_FALL").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, rainfallRate.size()); - - // check values - assertEquals("139.875", rainfallRate.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | sort offset | rex \".*rainfall_rate\\\":\\s(?<rain_FALL>\\d+.\\d+)\"", + testFile, ds -> { + // get extracted column data + List<String> rainfallRate = ds.select("rain_FALL").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, rainfallRate.size()); + + // check values + assertEquals("139.875", rainfallRate.get(0)); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rexTest_ExtractionModeMultipleCaptureGroups() { - streamingTestUtil.performDPLTest( - "index=index_A | rex \".*rainfall_rate\\\":\\s(?<rainFALL>\\d+.\\d+).*wind_speed\\\":\\s(?<windSPEDE>\\d+.\\d+).*latitude\\\":\\s(?<latiTUDE>-?\\d+.\\d+)\"", - testFile, - ds -> { - // get extracted column data - List<String> rainfallRate = ds.select("rainFALL").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List<String> windSpeed = ds.select("windSPEDE").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List<String> latitude = ds.select("latiTUDE").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, latitude.size()); - assertEquals(1, windSpeed.size()); - assertEquals(1, rainfallRate.size()); - - // check values - assertEquals("25.5", rainfallRate.get(0)); - assertEquals("51.0", windSpeed.get(0)); - assertEquals("-89.625", latitude.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | rex \".*rainfall_rate\\\":\\s(?<rainFALL>\\d+.\\d+).*wind_speed\\\":\\s(?<windSPEDE>\\d+.\\d+).*latitude\\\":\\s(?<latiTUDE>-?\\d+.\\d+)\"", + testFile, ds -> { + // get extracted column data + List<String> rainfallRate = ds + .select("rainFALL") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List<String> windSpeed = ds + .select("windSPEDE") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List<String> latitude = ds + .select("latiTUDE") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, latitude.size()); + assertEquals(1, windSpeed.size()); + assertEquals(1, rainfallRate.size()); + + // check values + 
assertEquals("25.5", rainfallRate.get(0)); + assertEquals("51.0", windSpeed.get(0)); + assertEquals("-89.625", latitude.get(0)); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rexTest_SedMode() { - streamingTestUtil.performDPLTest( - "index=index_A | rex mode=sed \"s/rainfall_rate/meltdown_rate/1\"", - testFile, - ds -> { - // get extracted column data - List rawData = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, rawData.size()); - - // check values - assertEquals("{\"meltdown_rate\": 25.5, \"wind_speed\": 51.0, \"atmosphere_water_vapor_content\": 76.5, \"atmosphere_cloud_liquid_water_content\": 2.5, \"latitude\": -89.625, \"rainfall_rate\": 139.875}", - rawData.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | rex mode=sed \"s/rainfall_rate/meltdown_rate/1\"", testFile, ds -> { + // get extracted column data + List rawData = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, rawData.size()); + + // check values + assertEquals( + "{\"meltdown_rate\": 25.5, \"wind_speed\": 51.0, \"atmosphere_water_vapor_content\": 76.5, \"atmosphere_cloud_liquid_water_content\": 2.5, \"latitude\": -89.625, \"rainfall_rate\": 139.875}", + rawData.get(0) + ); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rexTest_SedMode2() { - streamingTestUtil.performDPLTest( - "index=index_A | rex mode=sed \"s/rainfall_rate/meltdown_rate/g\"", - testFile, - ds -> { - // get extracted column data - List rawData = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, rawData.size()); - - // check values - assertEquals("{\"meltdown_rate\": 25.5, \"wind_speed\": 51.0, \"atmosphere_water_vapor_content\": 76.5, \"atmosphere_cloud_liquid_water_content\": 2.5, \"latitude\": -89.625, \"meltdown_rate\": 139.875}", - rawData.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | rex mode=sed \"s/rainfall_rate/meltdown_rate/g\"", testFile, ds -> { + // get extracted column data + List rawData = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, rawData.size()); + + // check values + assertEquals( + "{\"meltdown_rate\": 25.5, \"wind_speed\": 51.0, \"atmosphere_water_vapor_content\": 76.5, \"atmosphere_cloud_liquid_water_content\": 2.5, \"latitude\": -89.625, \"meltdown_rate\": 139.875}", + rawData.get(0) + ); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rexTest_SedMode_CustomDelimiter() { - streamingTestUtil.performDPLTest( - "index=index_A | rex mode=sed \"s;rainfall_rate;meltdown_rate;g\"", - testFile, - ds -> { - // get extracted column data - List rawData = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // every value should be unique - 
assertEquals(1, rawData.size()); - - // check values - assertEquals("{\"meltdown_rate\": 25.5, \"wind_speed\": 51.0, \"atmosphere_water_vapor_content\": 76.5, \"atmosphere_cloud_liquid_water_content\": 2.5, \"latitude\": -89.625, \"meltdown_rate\": 139.875}", - rawData.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | rex mode=sed \"s;rainfall_rate;meltdown_rate;g\"", testFile, ds -> { + // get extracted column data + List rawData = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, rawData.size()); + + // check values + assertEquals( + "{\"meltdown_rate\": 25.5, \"wind_speed\": 51.0, \"atmosphere_water_vapor_content\": 76.5, \"atmosphere_cloud_liquid_water_content\": 2.5, \"latitude\": -89.625, \"meltdown_rate\": 139.875}", + rawData.get(0) + ); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rexTest_SedMode_MissingSubstituteFlag() { - StreamingQueryException sqe = streamingTestUtil.performThrowingDPLTest(StreamingQueryException.class, - /* DPL Query = */ "index=index_A | rex mode=sed \";rainfall_rate;meltdown_rate;g\"", testFile, - ds -> { - // get extracted column data - List rawData = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, rawData.size()); - - // check values - assertEquals("{\"meltdown_rate\": 25.5, \"wind_speed\": 51.0, \"atmosphere_water_vapor_content\": 76.5, \"atmosphere_cloud_liquid_water_content\": 2.5, \"latitude\": -89.625, \"meltdown_rate\": 139.875}", - rawData.get(0)); - }); - - assertEquals("Caused by: java.lang.IllegalStateException: Expected sed mode to be substitute! Other modes are not supported.", - streamingTestUtil.getInternalCauseString(sqe.cause(), IllegalStateException.class)); + StreamingQueryException sqe = streamingTestUtil + .performThrowingDPLTest( + StreamingQueryException.class, + /* DPL Query = */ "index=index_A | rex mode=sed \";rainfall_rate;meltdown_rate;g\"", testFile, + ds -> { + // get extracted column data + List rawData = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, rawData.size()); + + // check values + assertEquals( + "{\"meltdown_rate\": 25.5, \"wind_speed\": 51.0, \"atmosphere_water_vapor_content\": 76.5, \"atmosphere_cloud_liquid_water_content\": 2.5, \"latitude\": -89.625, \"meltdown_rate\": 139.875}", + rawData.get(0) + ); + } + ); + + assertEquals( + "Caused by: java.lang.IllegalStateException: Expected sed mode to be substitute! 
Other modes are not supported.", + streamingTestUtil.getInternalCauseString(sqe.cause(), IllegalStateException.class) + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void rexTest_SedMode3() { - streamingTestUtil.performDPLTest( - "index=index_A | rex mode=sed \"s/rainfall_rate/meltdown_rate/2g\"", - testFile, - ds -> { - // get extracted column data - List rawData = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // every value should be unique - assertEquals(1, rawData.size()); - - // check values - assertEquals("{\"rainfall_rate\": 25.5, \"wind_speed\": 51.0, \"atmosphere_water_vapor_content\": 76.5, \"atmosphere_cloud_liquid_water_content\": 2.5, \"latitude\": -89.625, \"meltdown_rate\": 139.875}", - rawData.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | rex mode=sed \"s/rainfall_rate/meltdown_rate/2g\"", testFile, ds -> { + // get extracted column data + List rawData = ds.select("_raw").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + + // every value should be unique + assertEquals(1, rawData.size()); + + // check values + assertEquals( + "{\"rainfall_rate\": 25.5, \"wind_speed\": 51.0, \"atmosphere_water_vapor_content\": 76.5, \"atmosphere_cloud_liquid_water_content\": 2.5, \"latitude\": -89.625, \"meltdown_rate\": 139.875}", + rawData.get(0) + ); + } + ); } } - - diff --git a/src/test/java/com/teragrep/pth10/SearchTransformationTest.java b/src/test/java/com/teragrep/pth10/SearchTransformationTest.java index 2ce78e3..4ac650b 100644 --- a/src/test/java/com/teragrep/pth10/SearchTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/SearchTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -62,233 +62,350 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class SearchTransformationTest { - private static final Logger LOGGER = LoggerFactory.getLogger(SearchTransformationTest.class); - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); - - private StreamingTestUtil streamingTestUtil; - - @org.junit.jupiter.api.BeforeAll - void setEnv() { - this.streamingTestUtil = new StreamingTestUtil(this.testSchema); - this.streamingTestUtil.setEnv(); - } - - @org.junit.jupiter.api.BeforeEach - void setUp() { - this.streamingTestUtil.setUp(); - } - - @org.junit.jupiter.api.AfterEach - void tearDown() { - this.streamingTestUtil.tearDown(); - } - - - // ---------------------------------------- - // Tests - // ---------------------------------------- - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchTest_FieldComparison() { - String query = "index=index_A | search sourcetype!=stream2"; - String testFile = "src/test/resources/joinTransformationTest_data*.json"; // * to make the path into a directory path - - streamingTestUtil.performDPLTest(query, testFile, ds -> { - List listOfResult = ds.select("sourcetype").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expectedValues = Arrays.asList("stream1", "stream1", "stream1", "stream1", "stream1"); - assertEquals(expectedValues, listOfResult, "Batch consumer dataset did not contain the expected values !"); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchTest_Boolean() { - String query = "index=index_A | search sourcetype=stream1 AND (id = 1 OR id = 3)"; - String testFile = "src/test/resources/joinTransformationTest_data*.json"; // * to make the path into a directory path - - streamingTestUtil.performDPLTest(query, testFile, ds -> { - List listOfResult = ds.select("sourcetype").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expectedValues = Arrays.asList("stream1", "stream1"); - assertEquals(expectedValues, listOfResult, "Batch consumer dataset did not contain the expected values !"); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchTest_TextSearchFromRaw() { - String query = "index=index_A | search \"nothing\""; - String testFile = "src/test/resources/joinTransformationTest_data*.json"; // * to make the path into a directory path - - streamingTestUtil.performDPLTest(query, testFile, ds -> { - List listOfResult = ds.select("sourcetype").collectAsList().stream().map(r -> 
r.getAs(0).toString()).collect(Collectors.toList()); - List expectedValues = Collections.emptyList(); - assertEquals(expectedValues, listOfResult, "Batch consumer dataset did not contain the expected values !"); - }); - } - - - // Tests compareStatement after spath (spath makes all data into String) - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchAfterSpath_ComparisonTest() { - String query = "index=index_A | spath path= json | search json > 40"; - String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; - - streamingTestUtil.performDPLTest(query, testFile, ds -> { - List json = ds.select("json").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = new ArrayList<>(Arrays.asList("50", "60", "70", "80", "90", "100")); - - assertEquals(expected, json); - }); - } - - // Tests compareStatement after spath (spath makes all data into String) - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchAfterSpath_ComparisonTest2() { - String query = "index=index_A | spath path= json | search json <= 40"; - String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; - - streamingTestUtil.performDPLTest(query, testFile, ds -> { - List json = ds.select("json").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = new ArrayList<>(Arrays.asList("7", "8", "9", "40")); - - assertEquals(expected, json); - }); - } - - // Tests search with equals - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchComparisonEqTest() { - String query = "index=index_A | search sourcetype = stream1"; - String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; - - streamingTestUtil.performDPLTest(query, testFile, ds -> { - List sourcetype = ds.select("sourcetype").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = Arrays.asList("stream1", "stream1", "stream1", "stream1", "stream1"); - - assertEquals(expected, sourcetype); - }); - } - - // Tests search with equals and wildcard - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchComparisonEqWildcardTest() { - String query = "index=index_A | search sourcetype = stream*"; - String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; - - streamingTestUtil.performDPLTest(query, testFile, ds -> { - List sourcetype = ds.select("sourcetype").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = Arrays.asList("stream1", "stream2", "stream1", "stream2", "stream1", "stream2", "stream1", "stream2", "stream1", "stream2"); - - assertEquals(expected, sourcetype); - }); - } - - // Test search with not equals - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchComparisonNeqTest() { - String query = "index=index_A | search id != 10"; - String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; - - streamingTestUtil.performDPLTest(query, testFile, ds -> { - List id = ds.select("id").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = Arrays.asList("1", "2", "3", "4", "5", "6", "7","8", "9"); - - assertEquals(expected, 
id); - }); - } - - // Tests search with greater than - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchComparisonGtTest() { - String query = "index=index_A | search id > 9"; - String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; - - streamingTestUtil.performDPLTest(query, testFile, ds -> { - List id = ds.select("id").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = Collections.singletonList("10"); - - assertEquals(expected, id); - }); - } - - // Tests search with greater than or equal to - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchComparisonGteTest() { - String query = "index=index_A | search id >= 9"; - String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; - - streamingTestUtil.performDPLTest(query, testFile, ds -> { - List id = ds.select("id").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = Arrays.asList("9", "10"); - - assertEquals(expected, id); - }); - } - - // Tests search with less than - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchComparisonLtTest() { - String query = "index=index_A | search id < 10"; - String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; - - streamingTestUtil.performDPLTest(query, testFile, ds -> { - List id = ds.select("id").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = Arrays.asList("1", "2", "3", "4", "5", "6", "7","8", "9"); - - assertEquals(expected, id); - }); - } - - // Tests search with less than or equal to - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchComparisonLteTest() { - String query = "index=index_A | search id <= 10"; - String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; - - streamingTestUtil.performDPLTest(query, testFile, ds -> { - List id = ds.select("id").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = Arrays.asList("1", "2", "3", "4", "5", "6", "7","8", "9", "10"); - - assertEquals(expected, id); - }); - } - - // Tests search compare with a string and a number. 
Should be a lexicographical comparison - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void searchComparisonMixedInputTest() { - String query = "index=index_A | search \"source\" < 2"; - String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; - - streamingTestUtil.performDPLTest(query, testFile, ds -> { - List json = ds.select("source").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = Arrays.asList("127.0.0.0", "127.1.1.1", "127.2.2.2", "127.3.3.3", "127.4.4.4", - "127.5.5.5", "127.6.6.6", "127.7.7.7", "127.8.8.8", "127.9.9.9"); - - assertEquals(expected, json); - }); - } + + private static final Logger LOGGER = LoggerFactory.getLogger(SearchTransformationTest.class); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); + + private StreamingTestUtil streamingTestUtil; + + @org.junit.jupiter.api.BeforeAll + void setEnv() { + this.streamingTestUtil = new StreamingTestUtil(this.testSchema); + this.streamingTestUtil.setEnv(); + } + + @org.junit.jupiter.api.BeforeEach + void setUp() { + this.streamingTestUtil.setUp(); + } + + @org.junit.jupiter.api.AfterEach + void tearDown() { + this.streamingTestUtil.tearDown(); + } + + // ---------------------------------------- + // Tests + // ---------------------------------------- + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchTest_FieldComparison() { + String query = "index=index_A | search sourcetype!=stream2"; + String testFile = "src/test/resources/joinTransformationTest_data*.json"; // * to make the path into a directory path + + streamingTestUtil.performDPLTest(query, testFile, ds -> { + List listOfResult = ds + .select("sourcetype") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expectedValues = Arrays.asList("stream1", "stream1", "stream1", "stream1", "stream1"); + assertEquals(expectedValues, listOfResult, "Batch consumer dataset did not contain the expected values !"); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchTest_Boolean() { + String query = "index=index_A | search sourcetype=stream1 AND (id = 1 OR id = 3)"; + String testFile = "src/test/resources/joinTransformationTest_data*.json"; // * to make the path into a directory path + + streamingTestUtil.performDPLTest(query, testFile, ds -> { + List listOfResult = ds + .select("sourcetype") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expectedValues = Arrays.asList("stream1", "stream1"); + 
assertEquals(expectedValues, listOfResult, "Batch consumer dataset did not contain the expected values !"); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchTest_TextSearchFromRaw() { + String query = "index=index_A | search \"nothing\""; + String testFile = "src/test/resources/joinTransformationTest_data*.json"; // * to make the path into a directory path + + streamingTestUtil.performDPLTest(query, testFile, ds -> { + List listOfResult = ds + .select("sourcetype") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expectedValues = Collections.emptyList(); + assertEquals(expectedValues, listOfResult, "Batch consumer dataset did not contain the expected values !"); + }); + } + + // Tests compareStatement after spath (spath makes all data into String) + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchAfterSpath_ComparisonTest() { + String query = "index=index_A | spath path= json | search json > 40"; + String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; + + streamingTestUtil.performDPLTest(query, testFile, ds -> { + List json = ds + .select("json") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = new ArrayList<>(Arrays.asList("50", "60", "70", "80", "90", "100")); + + assertEquals(expected, json); + }); + } + + // Tests compareStatement after spath (spath makes all data into String) + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchAfterSpath_ComparisonTest2() { + String query = "index=index_A | spath path= json | search json <= 40"; + String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; + + streamingTestUtil.performDPLTest(query, testFile, ds -> { + List json = ds + .select("json") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = new ArrayList<>(Arrays.asList("7", "8", "9", "40")); + + assertEquals(expected, json); + }); + } + + // Tests search with equals + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchComparisonEqTest() { + String query = "index=index_A | search sourcetype = stream1"; + String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; + + streamingTestUtil.performDPLTest(query, testFile, ds -> { + List sourcetype = ds + .select("sourcetype") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = Arrays.asList("stream1", "stream1", "stream1", "stream1", "stream1"); + + assertEquals(expected, sourcetype); + }); + } + + // Tests search with equals and wildcard + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchComparisonEqWildcardTest() { + String query = "index=index_A | search sourcetype = stream*"; + String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; + + streamingTestUtil.performDPLTest(query, testFile, ds -> { + List sourcetype = ds + .select("sourcetype") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = Arrays + .asList( + "stream1", "stream2", "stream1", 
"stream2", "stream1", "stream2", "stream1", "stream2", + "stream1", "stream2" + ); + + assertEquals(expected, sourcetype); + }); + } + + // Test search with not equals + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchComparisonNeqTest() { + String query = "index=index_A | search id != 10"; + String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; + + streamingTestUtil.performDPLTest(query, testFile, ds -> { + List id = ds + .select("id") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9"); + + assertEquals(expected, id); + }); + } + + // Tests search with greater than + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchComparisonGtTest() { + String query = "index=index_A | search id > 9"; + String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; + + streamingTestUtil.performDPLTest(query, testFile, ds -> { + List id = ds + .select("id") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = Collections.singletonList("10"); + + assertEquals(expected, id); + }); + } + + // Tests search with greater than or equal to + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchComparisonGteTest() { + String query = "index=index_A | search id >= 9"; + String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; + + streamingTestUtil.performDPLTest(query, testFile, ds -> { + List id = ds + .select("id") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = Arrays.asList("9", "10"); + + assertEquals(expected, id); + }); + } + + // Tests search with less than + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchComparisonLtTest() { + String query = "index=index_A | search id < 10"; + String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; + + streamingTestUtil.performDPLTest(query, testFile, ds -> { + List id = ds + .select("id") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9"); + + assertEquals(expected, id); + }); + } + + // Tests search with less than or equal to + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchComparisonLteTest() { + String query = "index=index_A | search id <= 10"; + String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; + + streamingTestUtil.performDPLTest(query, testFile, ds -> { + List id = ds + .select("id") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"); + + assertEquals(expected, id); + }); + } + + // Tests search compare with a string and a number. 
Should be a lexicographical comparison + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void searchComparisonMixedInputTest() { + String query = "index=index_A | search \"source\" < 2"; + String testFile = "src/test/resources/spath/spathTransformationTest_numeric2*.json"; + + streamingTestUtil.performDPLTest(query, testFile, ds -> { + List json = ds + .select("source") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = Arrays + .asList( + "127.0.0.0", "127.1.1.1", "127.2.2.2", "127.3.3.3", "127.4.4.4", "127.5.5.5", "127.6.6.6", + "127.7.7.7", "127.8.8.8", "127.9.9.9" + ); + + assertEquals(expected, json); + }); + } } - diff --git a/src/test/java/com/teragrep/pth10/SendemailTransformationTest.java b/src/test/java/com/teragrep/pth10/SendemailTransformationTest.java index eea5a0e..c822282 100644 --- a/src/test/java/com/teragrep/pth10/SendemailTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/SendemailTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -45,7 +45,6 @@ */ package com.teragrep.pth10; - import com.icegreen.greenmail.junit5.GreenMailExtension; import com.icegreen.greenmail.util.ServerSetup; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -73,257 +72,271 @@ import static org.junit.jupiter.api.Assertions.*; - /** * @author eemhu - * */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class SendemailTransformationTest { - private static final Logger LOGGER = LoggerFactory.getLogger(SendemailTransformationTest.class); - - private final String testFile = "src/test/resources/sendemailTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); - - private StreamingTestUtil streamingTestUtil; - - @org.junit.jupiter.api.BeforeAll - void setEnv() throws IOException { - this.streamingTestUtil = new StreamingTestUtil(this.testSchema); - this.streamingTestUtil.setEnv(); - } 
- - @org.junit.jupiter.api.BeforeEach - void setUp() { - this.streamingTestUtil.setUp(); - - DPLParserCatalystContext ctx = this.streamingTestUtil.getCtx(); - DPLParserCatalystVisitor visitor = this.streamingTestUtil.getCatalystVisitor(); - - // set path for join cmd - visitor.setHdfsPath("/tmp/pth_10/" + UUID.randomUUID()); - - // set paragraph url - ctx.setBaseUrl("http://teragrep.test"); - ctx.setNotebookUrl("NoteBookID"); - ctx.setParagraphUrl("ParaGraphID"); - - greenMail.start(); - } - - @org.junit.jupiter.api.AfterEach - void tearDown() { - greenMail.stop(); - this.streamingTestUtil.tearDown(); - } - - @RegisterExtension - static GreenMailExtension greenMail = new GreenMailExtension(new ServerSetup(2525, "localhost", "smtp")); - - - // ---------------------------------------- - // Tests - // ---------------------------------------- - - // basic email without results, no aggregations - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void sendemail_test_1() { - // Perform DPL query with streaming data - streamingTestUtil.performDPLTest( - "index=index_A | sendemail to=exa@mple.test from=from@example.test cc=cc@example.test server=localhost:2525", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); - - // Get message - MimeMessage msg = greenMail.getReceivedMessages()[0]; - String msgStr = assertDoesNotThrow(() -> msgToString(msg)); - - // Get toEmails and subject. - String[] toEmails = assertDoesNotThrow(() -> msg.getHeader("to")); - String subject = assertDoesNotThrow(() -> msg.getHeader("subject")[0]); - String cc = assertDoesNotThrow(() -> msg.getHeader("cc")[0]); - String from = assertDoesNotThrow(() -> msg.getHeader("from")[0]); - - // Assertions - assertTrue(msgStr.contains("Search complete.")); - assertEquals(1, toEmails.length); - assertEquals("exa@mple.test", toEmails[0]); - assertEquals("cc@example.test", cc); - assertEquals("from@example.test", from); - assertEquals("Teragrep Results", subject); - } - - // basic email with two preceding eval commands - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void sendemail_test_2() { - // Perform DPL query with streaming data - streamingTestUtil.performDPLTest( - "index=index_A | eval extraField=null() | eval oneMoreField=true() | sendemail to=\"exa@mple.test\" server=localhost:2525", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, extraField, oneMoreField]", - Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); - - // Get message - MimeMessage msg = greenMail.getReceivedMessagesForDomain("exa@mple.test")[0]; - String msgStr = assertDoesNotThrow(() -> msgToString(msg)); - - // Get toEmails and subject. 
- String[] toEmails = assertDoesNotThrow(() -> msg.getHeader("to")); - String subject = assertDoesNotThrow(() -> msg.getHeader("subject")[0]); - - // Assertions - assertTrue(msgStr.contains("Search complete.")); - assertEquals(1, toEmails.length); - assertEquals("exa@mple.test", toEmails[0]); - assertEquals("Teragrep Results", subject); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void sendemail_test_3() { - // Perform DPL query with streaming data - streamingTestUtil.performDPLTest( - "index=index_A | chart avg(offset) as avgo | chart avg(avgo) as resultssss | sendemail to=\"exa@mple.test\" sendresults=true inline=true sendpdf=true format=csv server=localhost:2525 ", - testFile, - ds -> { - - } - ); - - // Get message - MimeMessage msg = greenMail.getReceivedMessagesForDomain("exa@mple.test")[0]; - String msgStr = assertDoesNotThrow(() -> msgToString(msg)); - - // Get toEmails and subject. - String[] toEmails = assertDoesNotThrow(() -> msg.getHeader("to")); - String subject = assertDoesNotThrow(() -> msg.getHeader("subject")[0]); - - // Assertions - assertTrue(msgStr.contains("Search results.")); - - // if message contains the column headers like this it will contain the csv too - assertTrue(msgStr.contains("result")); - assertEquals(1, toEmails.length); - assertEquals("exa@mple.test", toEmails[0]); - assertEquals("Teragrep Results", subject); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void sendemail_test_4() { - // Perform DPL query with streaming data - streamingTestUtil.performDPLTest( - "index=index_A | sendemail to=\"exa@mple.test\" subject=\"Custom subject\" sendresults=true inline=true format=csv server=localhost:2525", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset]", - Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); - - // Get message - MimeMessage msg = greenMail.getReceivedMessagesForDomain("exa@mple.test")[0]; - String msgStr = assertDoesNotThrow(() -> msgToString(msg)); - - // Get toEmails and subject.; - String[] toEmails = assertDoesNotThrow(() -> msg.getHeader("to")); - String subject = assertDoesNotThrow(() -> msg.getHeader("subject")[0]); - - // Assertions - assertTrue(msgStr.contains("Search results.")); - - // if message contains the column headers like this it will contain the csv too - assertTrue(msgStr.contains("_time,id,_raw,index,sourcetype,host,source,partition,offset")); - assertEquals(1, toEmails.length); - assertEquals("exa@mple.test", toEmails[0]); - assertEquals("Custom subject", subject); - } - - // pipe where after stats, then send email - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void sendemail_test_5() { - // Perform DPL query with streaming data - streamingTestUtil.performDPLTest( - "index=index_A | stats avg(offset) as avgo count(offset) as co | where co > 1 | sendemail to=\"exa@mple.test\" server=localhost:2525", - testFile, - ds -> { - assertEquals("[avgo, co]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); - - // Get message - MimeMessage msg = greenMail.getReceivedMessagesForDomain("exa@mple.test")[0]; - String msgStr = assertDoesNotThrow(() -> msgToString(msg)); - - // Get toEmails and subject. 
- String[] toEmails = assertDoesNotThrow(() -> msg.getHeader("to")); - String subject = assertDoesNotThrow(() -> msg.getHeader("subject")[0]); - - // Assertions - assertTrue(msgStr.contains("Search complete.")); - assertEquals(1, toEmails.length); - assertEquals("exa@mple.test", toEmails[0]); - assertEquals("Teragrep Results", subject); - } - - // empty resultset must not send email - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void sendemailTestEmptyResultset() { - // Perform DPL query with streaming data - streamingTestUtil.performDPLTest( - "index=index_A" + - "|chart count(_raw) as craw" + - "|where craw < 0 " + // filter out all - "|sendemail to=\"1@example.com\" server=localhost:2525", - testFile, - ds -> { - // returns empty dataframe, but has column names present - assertEquals("[craw]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); - - // must not send any message - assertEquals(0, greenMail.getReceivedMessagesForDomain("1@example.com").length); - } - - // ---------------------------------------- - // Helper methods - // ---------------------------------------- - - private String msgToString(MimeMessage mimeMsg) throws MessagingException { - String text = new BufferedReader( - new InputStreamReader(mimeMsg.getRawInputStream(), StandardCharsets.UTF_8)) - .lines() - .collect(Collectors.joining("\n")); - - return text; - } - -} \ No newline at end of file + + private static final Logger LOGGER = LoggerFactory.getLogger(SendemailTransformationTest.class); + + private final String testFile = "src/test/resources/sendemailTransformationTest_data*.json"; // * to make the path into a directory path + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); + + private StreamingTestUtil streamingTestUtil; + + @org.junit.jupiter.api.BeforeAll + void setEnv() throws IOException { + this.streamingTestUtil = new StreamingTestUtil(this.testSchema); + this.streamingTestUtil.setEnv(); + } + + @org.junit.jupiter.api.BeforeEach + void setUp() { + this.streamingTestUtil.setUp(); + + DPLParserCatalystContext ctx = this.streamingTestUtil.getCtx(); + DPLParserCatalystVisitor visitor = this.streamingTestUtil.getCatalystVisitor(); + + // set path for join cmd + visitor.setHdfsPath("/tmp/pth_10/" + UUID.randomUUID()); + + // set paragraph url + ctx.setBaseUrl("http://teragrep.test"); + ctx.setNotebookUrl("NoteBookID"); + ctx.setParagraphUrl("ParaGraphID"); + + greenMail.start(); + } + + @org.junit.jupiter.api.AfterEach + void tearDown() { + greenMail.stop(); + this.streamingTestUtil.tearDown(); + } + + @RegisterExtension + static GreenMailExtension greenMail = new GreenMailExtension(new 
ServerSetup(2525, "localhost", "smtp")); + + // ---------------------------------------- + // Tests + // ---------------------------------------- + + // basic email without results, no aggregations + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void sendemail_test_1() { + // Perform DPL query with streaming data + streamingTestUtil + .performDPLTest( + "index=index_A | sendemail to=exa@mple.test from=from@example.test cc=cc@example.test server=localhost:2525", + testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" + ); + } + ); + + // Get message + MimeMessage msg = greenMail.getReceivedMessages()[0]; + String msgStr = assertDoesNotThrow(() -> msgToString(msg)); + + // Get toEmails and subject. + String[] toEmails = assertDoesNotThrow(() -> msg.getHeader("to")); + String subject = assertDoesNotThrow(() -> msg.getHeader("subject")[0]); + String cc = assertDoesNotThrow(() -> msg.getHeader("cc")[0]); + String from = assertDoesNotThrow(() -> msg.getHeader("from")[0]); + + // Assertions + assertTrue(msgStr.contains("Search complete.")); + assertEquals(1, toEmails.length); + assertEquals("exa@mple.test", toEmails[0]); + assertEquals("cc@example.test", cc); + assertEquals("from@example.test", from); + assertEquals("Teragrep Results", subject); + } + + // basic email with two preceding eval commands + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void sendemail_test_2() { + // Perform DPL query with streaming data + streamingTestUtil + .performDPLTest( + "index=index_A | eval extraField=null() | eval oneMoreField=true() | sendemail to=\"exa@mple.test\" server=localhost:2525", + testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, extraField, oneMoreField]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + } + ); + + // Get message + MimeMessage msg = greenMail.getReceivedMessagesForDomain("exa@mple.test")[0]; + String msgStr = assertDoesNotThrow(() -> msgToString(msg)); + + // Get toEmails and subject. + String[] toEmails = assertDoesNotThrow(() -> msg.getHeader("to")); + String subject = assertDoesNotThrow(() -> msg.getHeader("subject")[0]); + + // Assertions + assertTrue(msgStr.contains("Search complete.")); + assertEquals(1, toEmails.length); + assertEquals("exa@mple.test", toEmails[0]); + assertEquals("Teragrep Results", subject); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void sendemail_test_3() { + // Perform DPL query with streaming data + streamingTestUtil + .performDPLTest( + "index=index_A | chart avg(offset) as avgo | chart avg(avgo) as resultssss | sendemail to=\"exa@mple.test\" sendresults=true inline=true sendpdf=true format=csv server=localhost:2525 ", + testFile, ds -> { + + } + ); + + // Get message + MimeMessage msg = greenMail.getReceivedMessagesForDomain("exa@mple.test")[0]; + String msgStr = assertDoesNotThrow(() -> msgToString(msg)); + + // Get toEmails and subject. 
+ String[] toEmails = assertDoesNotThrow(() -> msg.getHeader("to")); + String subject = assertDoesNotThrow(() -> msg.getHeader("subject")[0]); + + // Assertions + assertTrue(msgStr.contains("Search results.")); + + // if message contains the column headers like this it will contain the csv too + assertTrue(msgStr.contains("result")); + assertEquals(1, toEmails.length); + assertEquals("exa@mple.test", toEmails[0]); + assertEquals("Teragrep Results", subject); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void sendemail_test_4() { + // Perform DPL query with streaming data + streamingTestUtil + .performDPLTest( + "index=index_A | sendemail to=\"exa@mple.test\" subject=\"Custom subject\" sendresults=true inline=true format=csv server=localhost:2525", + testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" + ); + } + ); + + // Get message + MimeMessage msg = greenMail.getReceivedMessagesForDomain("exa@mple.test")[0]; + String msgStr = assertDoesNotThrow(() -> msgToString(msg)); + + // Get toEmails and subject.; + String[] toEmails = assertDoesNotThrow(() -> msg.getHeader("to")); + String subject = assertDoesNotThrow(() -> msg.getHeader("subject")[0]); + + // Assertions + assertTrue(msgStr.contains("Search results.")); + + // if message contains the column headers like this it will contain the csv too + assertTrue(msgStr.contains("_time,id,_raw,index,sourcetype,host,source,partition,offset")); + assertEquals(1, toEmails.length); + assertEquals("exa@mple.test", toEmails[0]); + assertEquals("Custom subject", subject); + } + + // pipe where after stats, then send email + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void sendemail_test_5() { + // Perform DPL query with streaming data + streamingTestUtil + .performDPLTest( + "index=index_A | stats avg(offset) as avgo count(offset) as co | where co > 1 | sendemail to=\"exa@mple.test\" server=localhost:2525", + testFile, ds -> { + assertEquals( + "[avgo, co]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + } + ); + + // Get message + MimeMessage msg = greenMail.getReceivedMessagesForDomain("exa@mple.test")[0]; + String msgStr = assertDoesNotThrow(() -> msgToString(msg)); + + // Get toEmails and subject. 
+ String[] toEmails = assertDoesNotThrow(() -> msg.getHeader("to")); + String subject = assertDoesNotThrow(() -> msg.getHeader("subject")[0]); + + // Assertions + assertTrue(msgStr.contains("Search complete.")); + assertEquals(1, toEmails.length); + assertEquals("exa@mple.test", toEmails[0]); + assertEquals("Teragrep Results", subject); + } + + // empty resultset must not send email + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void sendemailTestEmptyResultset() { + // Perform DPL query with streaming data + streamingTestUtil.performDPLTest("index=index_A" + "|chart count(_raw) as craw" + "|where craw < 0 " + // filter out all + "|sendemail to=\"1@example.com\" server=localhost:2525", testFile, ds -> { + // returns empty dataframe, but has column names present + assertEquals("[craw]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); + } + ); + + // must not send any message + assertEquals(0, greenMail.getReceivedMessagesForDomain("1@example.com").length); + } + + // ---------------------------------------- + // Helper methods + // ---------------------------------------- + + private String msgToString(MimeMessage mimeMsg) throws MessagingException { + String text = new BufferedReader(new InputStreamReader(mimeMsg.getRawInputStream(), StandardCharsets.UTF_8)) + .lines() + .collect(Collectors.joining("\n")); + + return text; + } + +} diff --git a/src/test/java/com/teragrep/pth10/SortTransformationTest.java b/src/test/java/com/teragrep/pth10/SortTransformationTest.java index f2fbb2a..13b0bde 100644 --- a/src/test/java/com/teragrep/pth10/SortTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/SortTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -61,30 +61,28 @@ import static org.junit.jupiter.api.Assertions.assertEquals; /** - * Tests for the new ProcessingStack implementation - * Uses streaming datasets + * Tests for the new ProcessingStack implementation Uses streaming datasets + * * @author eemhu - * */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class SortTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(SortTransformationTest.class); private final String testFile = "src/test/resources/sortTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -104,7 +102,6 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- @@ -112,183 +109,206 @@ void tearDown() { // FIXME fix these when sort command is fixed on parser side // (spaces before fields or auto(), num(), etc.) 
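    // The sort tests below all follow the same assertion pattern: run the DPL query
    // through streamingTestUtil.performDPLTest, collect the sorted column with
    // ds.select(...).collectAsList(), and compare the stringified row values against
    // the expected ordering. A minimal sketch of that pattern as a reusable helper is
    // given here for reference only; assertColumnOrder is a hypothetical name and it
    // assumes Dataset<Row>, Row, List, Arrays and assertEquals are in scope. It is
    // not part of the test class itself.
    //
    //     private void assertColumnOrder(Dataset<Row> ds, String column, String expectedOrder) {
    //         // collect the column values in dataset order and compare the arrangement
    //         List<Row> rows = ds.select(column).collectAsList();
    //         assertEquals(expectedOrder, Arrays.toString(rows.stream().map(r -> r.getAs(0).toString()).toArray()));
    //     }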
@Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // ascending auto sortByType with desc override + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // ascending auto sortByType with desc override public void sort_test_1() { - streamingTestUtil.performDPLTest( - "index=index_A | sort + auto(offset) desc", - testFile, - ds -> { - List listOfOffset = ds.select("offset").collectAsList(); - long firstOffset = listOfOffset.get(0).getLong(0); - long lastOffset = listOfOffset.get(listOfOffset.size()-1).getLong(0); - - assertEquals(10, firstOffset); - assertEquals(1, lastOffset); - } - ); + streamingTestUtil.performDPLTest("index=index_A | sort + auto(offset) desc", testFile, ds -> { + List listOfOffset = ds.select("offset").collectAsList(); + long firstOffset = listOfOffset.get(0).getLong(0); + long lastOffset = listOfOffset.get(listOfOffset.size() - 1).getLong(0); + + assertEquals(10, firstOffset); + assertEquals(1, lastOffset); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // override default/auto sorting + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // override default/auto sorting public void sort_test_1b() { - streamingTestUtil.performDPLTest( - "index=index_A | sort 10 + str(offset)", - testFile, - ds -> { - List listOfOffset = ds.select("offset").collectAsList(); - - assertEquals("[1, 10, 2, 3, 4, 5, 6, 7, 8, 9]", - Arrays.toString(listOfOffset.stream().map(r -> r.getAs(0).toString()).toArray())); - } - ); + streamingTestUtil.performDPLTest("index=index_A | sort 10 + str(offset)", testFile, ds -> { + List listOfOffset = ds.select("offset").collectAsList(); + + assertEquals( + "[1, 10, 2, 3, 4, 5, 6, 7, 8, 9]", Arrays.toString(listOfOffset.stream().map(r -> r.getAs(0).toString()).toArray()) + ); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // descending sourcetype (pth03 parsing issue?, seems to be ascending) + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // descending sourcetype (pth03 parsing issue?, seems to be ascending) public void sort_test_2() { - streamingTestUtil.performDPLTest( - "index=index_A | sort limit=0 + sourcetype", - testFile, - ds -> { - List listOfSourcetype = ds.select("sourcetype").collectAsList(); - - assertEquals("[stream1, stream1, stream1, stream1, stream1, stream2, stream2, stream2, stream2, stream2]", - Arrays.toString(listOfSourcetype.stream().map(r -> r.getAs(0).toString()).toArray())); - } - ); + streamingTestUtil.performDPLTest("index=index_A | sort limit=0 + sourcetype", testFile, ds -> { + List listOfSourcetype = ds.select("sourcetype").collectAsList(); + + assertEquals( + "[stream1, stream1, stream1, stream1, stream1, stream2, stream2, stream2, stream2, stream2]", + Arrays.toString(listOfSourcetype.stream().map(r -> r.getAs(0).toString()).toArray()) + ); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // descending sort by ip address type + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // descending sort by ip address type public void sort_test_3() { - streamingTestUtil.performDPLTest( - "index=index_A | sort - ip(source)", - testFile, - ds -> { - List listOfSource = ds.select("source").collectAsList(); - - assertEquals("[127.9.9.9, 127.8.8.8, 127.7.7.7, 127.6.6.6, 127.5.5.5, 127.4.4.4, 127.3.3.3, 127.2.2.2, 127.1.1.1, 127.0.0.0]", - Arrays.toString(listOfSource.stream().map(r -> 
r.getAs(0).toString()).toArray())); - } - ); + streamingTestUtil.performDPLTest("index=index_A | sort - ip(source)", testFile, ds -> { + List listOfSource = ds.select("source").collectAsList(); + + assertEquals( + "[127.9.9.9, 127.8.8.8, 127.7.7.7, 127.6.6.6, 127.5.5.5, 127.4.4.4, 127.3.3.3, 127.2.2.2, 127.1.1.1, 127.0.0.0]", + Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray()) + ); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // sort with aggregate + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // sort with aggregate public void sort_test_4() { - streamingTestUtil.performDPLTest( - "index=index_A | stats count(offset) as count_offset avg(offset) as avg_offset by sourcetype | sort +num(avg_offset)", - testFile, - ds -> { - List listOfSource = ds.select("avg_offset").collectAsList(); - - assertEquals("[5.0, 6.0]", Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray())); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | stats count(offset) as count_offset avg(offset) as avg_offset by sourcetype | sort +num(avg_offset)", + testFile, ds -> { + List listOfSource = ds.select("avg_offset").collectAsList(); + + assertEquals( + "[5.0, 6.0]", Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray()) + ); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // chained sort + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // chained sort public void sort_test_5() { - streamingTestUtil.performDPLTest( - "index=index_A | sort - num(offset)", - testFile, - ds -> { - List listOfSource = - ds.select("offset").collectAsList(); - - assertEquals("[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]", Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray())); - } - ); + streamingTestUtil.performDPLTest("index=index_A | sort - num(offset)", testFile, ds -> { + List listOfSource = ds.select("offset").collectAsList(); + + assertEquals( + "[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]", Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray()) + ); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // sort with a group by aggregate, descending sort + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // sort with a group by aggregate, descending sort public void sort_test_6() { - streamingTestUtil.performDPLTest( - "index=index_A | stats max(offset) AS max_off by id | sort -num(max_off)", - testFile, - ds -> { - List listOfSource = - ds.select("max_off").collectAsList(); - - assertEquals("[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]", Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray())); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | stats max(offset) AS max_off by id | sort -num(max_off)", testFile, ds -> { + List listOfSource = ds.select("max_off").collectAsList(); + + assertEquals( + "[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]", + Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray()) + ); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // sort with a group by aggregate, ascending sort + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // sort with a group by aggregate, ascending sort public void sort_test_7() { - streamingTestUtil.performDPLTest( - "index=index_A | stats max(offset) AS max_off by id | sort 
+num(max_off)", - testFile, - ds -> { - List listOfSource = - ds.select("max_off").collectAsList(); - - assertEquals("[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]", Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray())); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | stats max(offset) AS max_off by id | sort +num(max_off)", testFile, ds -> { + List listOfSource = ds.select("max_off").collectAsList(); + + assertEquals( + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]", + Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray()) + ); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // sort with a group by aggregate with auto sort + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // sort with a group by aggregate with auto sort public void sort_test_8() { - streamingTestUtil.performDPLTest( - "index=index_A | stats max(offset) AS max_off by id | sort -auto(max_off)", - testFile, - ds -> { - List listOfSource = - ds.select("max_off").collectAsList(); - - assertEquals("[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]", Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray())); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | stats max(offset) AS max_off by id | sort -auto(max_off)", testFile, ds -> { + List listOfSource = ds.select("max_off").collectAsList(); + + assertEquals( + "[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]", + Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray()) + ); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // auto sort after eval + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // auto sort after eval public void sort_test_9() { - streamingTestUtil.performDPLTest( - "index=index_A | eval a = offset + 4 | sort -auto(a)", - testFile, - ds -> { - List listOfSource = - ds.select("a").collectAsList(); - - assertEquals("[14, 13, 12, 11, 10, 9, 8, 7, 6, 5]", Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray())); - } - ); + streamingTestUtil.performDPLTest("index=index_A | eval a = offset + 4 | sort -auto(a)", testFile, ds -> { + List listOfSource = ds.select("a").collectAsList(); + + assertEquals( + "[14, 13, 12, 11, 10, 9, 8, 7, 6, 5]", Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray()) + ); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // auto sort strings + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // auto sort strings public void sort_test_10() { - streamingTestUtil.performDPLTest( - "index=index_A | eval a = if ( offset < 6, \"abc\", \"bcd\") | sort +auto(a)", - testFile, - ds -> { - List listOfSource = - ds.select("a").collectAsList(); - - assertEquals("[[abc], [abc], [abc], [abc], [abc], [bcd], [bcd], [bcd], [bcd], [bcd]]", Arrays.toString(listOfSource.stream().map(r -> r.getList(0).toString()).toArray())); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | eval a = if ( offset < 6, \"abc\", \"bcd\") | sort +auto(a)", testFile, ds -> { + List listOfSource = ds.select("a").collectAsList(); + + assertEquals( + "[[abc], [abc], [abc], [abc], [abc], [bcd], [bcd], [bcd], [bcd], [bcd]]", + Arrays.toString(listOfSource.stream().map(r -> r.getList(0).toString()).toArray()) + ); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // auto sort ip addresses + @DisabledIfSystemProperty( + named = 
"skipSparkTest", + matches = "true" + ) // auto sort ip addresses public void sort_test_11() { - streamingTestUtil.performDPLTest( - "index=index_A | sort +auto(source)", - testFile, - ds -> { - List listOfSource = - ds.select("source").collectAsList(); - - assertEquals("[127.0.0.0, 127.1.1.1, 127.2.2.2, 127.3.3.3, 127.4.4.4, 127.5.5.5, 127.6.6.6, 127.7.7.7, 127.8.8.8, 127.9.9.9]", Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray())); - } - ); + streamingTestUtil.performDPLTest("index=index_A | sort +auto(source)", testFile, ds -> { + List listOfSource = ds.select("source").collectAsList(); + + assertEquals( + "[127.0.0.0, 127.1.1.1, 127.2.2.2, 127.3.3.3, 127.4.4.4, 127.5.5.5, 127.6.6.6, 127.7.7.7, 127.8.8.8, 127.9.9.9]", + Arrays.toString(listOfSource.stream().map(r -> r.getAs(0).toString()).toArray()) + ); + }); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/SpathTransformationTest.java b/src/test/java/com/teragrep/pth10/SpathTransformationTest.java index eed6e07..b7e1aa2 100644 --- a/src/test/java/com/teragrep/pth10/SpathTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/SpathTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -64,28 +64,26 @@ import static org.junit.jupiter.api.Assertions.*; /** - * Tests for spath command - * Uses streaming datasets + * Tests for spath command Uses streaming datasets * * @author eemhu */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class SpathTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(SpathTransformationTest.class); - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -105,7 +103,6 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- @@ -119,252 +116,381 @@ void tearDown() { final String INVALID_DATA = "src/test/resources/spath/spathTransformationTest_invalid*.json"; @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void spathTestXml() { - streamingTestUtil.performDPLTest( - "index=index_A | spath input=_raw path=\"main.sub.item\"", - XML_DATA_1, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, main.sub.item]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - - String result = ds.select("`main.sub.item`").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()).get(0); - assertEquals("Hello world", result); - } - ); + streamingTestUtil.performDPLTest("index=index_A | spath input=_raw path=\"main.sub.item\"", XML_DATA_1, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, main.sub.item]", + Arrays.toString(ds.columns()), "Batch 
handler dataset contained an unexpected column arrangement !" + ); + + String result = ds + .select("`main.sub.item`") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + assertEquals("Hello world", result); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void spathTestXmlWithOutput() { - streamingTestUtil.performDPLTest( - "index=index_A | spath input=_raw output=OUT path=\"main.sub.item\"", - XML_DATA_1, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, OUT]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - - String result = ds.select("OUT").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()).get(0); - assertEquals("Hello world", result); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | spath input=_raw output=OUT path=\"main.sub.item\"", XML_DATA_1, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, OUT]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" + ); + + String result = ds + .select("OUT") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + assertEquals("Hello world", result); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void spathTestXmlWithOutput_MultipleTagsOnSameLevel() { - streamingTestUtil.performDPLTest( - "index=index_A | spath input=_raw output=OUT path=\"main.sub[1].item\"", - XML_DATA_2, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, OUT]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - - String result = ds.select("OUT").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()).get(0); - assertEquals("Hello", result); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | spath input=_raw output=OUT path=\"main.sub[1].item\"", XML_DATA_2, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, OUT]", Arrays + .toString(ds.columns()), + "Batch handler dataset contained an unexpected column arrangement !" 
+ ); + + String result = ds + .select("OUT") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + assertEquals("Hello", result); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void spathTestJson() { - streamingTestUtil.performDPLTest( - "index=index_A | spath input=_raw path=json", - JSON_DATA_1, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, json]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - String result = ds.select("json").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()).get(0); - assertEquals("debugo", result); - } - ); + streamingTestUtil.performDPLTest("index=index_A | spath input=_raw path=json", JSON_DATA_1, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, json]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + String result = ds + .select("json") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + assertEquals("debugo", result); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void spathTestJsonWithOutput() { - streamingTestUtil.performDPLTest( - "index=index_A | spath input=_raw output=OUT path=json", - JSON_DATA_1, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, OUT]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - String result = ds.select("OUT").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()).get(0); - assertEquals("debugo", result); - } - ); + streamingTestUtil.performDPLTest("index=index_A | spath input=_raw output=OUT path=json", JSON_DATA_1, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, OUT]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + String result = ds + .select("OUT") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + assertEquals("debugo", result); + }); } @Disabled - @Test + @Test // output without path is invalid syntax public void spathTestJsonNoPath() { - streamingTestUtil.performDPLTest( - "index=index_A | spath input=_raw output=OUT", - JSON_DATA_1, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, OUT]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - String result = ds.select("OUT").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()).get(0); - assertEquals("debugo\nxml", result); - } - ); + streamingTestUtil.performDPLTest("index=index_A | spath input=_raw output=OUT", JSON_DATA_1, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, OUT]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" 
+ ); + String result = ds + .select("OUT") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + assertEquals("debugo\nxml", result); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void spathTestJsonInvalidInput() { - RuntimeException sqe = this.streamingTestUtil.performThrowingDPLTest(RuntimeException.class, "index=index_A | eval a = \"12.34\" | spath input=a", JSON_DATA_1, ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset]", Arrays.toString(ds.schema().fieldNames())); - }); + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void spathTestJsonInvalidInput() { + RuntimeException sqe = this.streamingTestUtil + .performThrowingDPLTest( + RuntimeException.class, "index=index_A | eval a = \"12.34\" | spath input=a", JSON_DATA_1, + ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset]", + Arrays.toString(ds.schema().fieldNames()) + ); + } + ); String causeStr = this.streamingTestUtil.getInternalCauseString(sqe.getCause(), IllegalStateException.class); assertEquals("Caused by: java.lang.IllegalStateException: Not a JSON Object: 12.34", causeStr); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // auto extract with xml and makeresults in front public void spathTestXmlWithMakeResultsAndAutoExtraction() { - streamingTestUtil.performDPLTest( - "| makeresults count=10 | eval a = \"
<main><sub>Hello</sub><sub>World</sub></main>
\" | spath input=a", - XML_DATA_2, - ds -> { - assertEquals("[_time, a, main.sub]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - String result = ds.select("`main.sub`").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()).get(0); - assertEquals("Hello\nWorld", result); - } - ); + streamingTestUtil + .performDPLTest( + "| makeresults count=10 | eval a = \"
<main><sub>Hello</sub><sub>World</sub></main>
\" | spath input=a", + XML_DATA_2, ds -> { + assertEquals( + "[_time, a, main.sub]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + String result = ds + .select("`main.sub`") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + assertEquals("Hello\nWorld", result); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void spathTestAutoExtractionXml() { - streamingTestUtil.performDPLTest( - "index=index_A | spath", - XML_DATA_2, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, main.sub.item]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - String result = ds.select("`main.sub.item`").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()).get(0); - assertEquals("Hello\nHello2\n1", result); - } - ); + streamingTestUtil.performDPLTest("index=index_A | spath", XML_DATA_2, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, main.sub.item]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + String result = ds + .select("`main.sub.item`") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + assertEquals("Hello\nHello2\n1", result); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void spathTestAutoExtractionJson() { - streamingTestUtil.performDPLTest( - "index=index_A | spath", - JSON_DATA_1, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, json, lil]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - String result = ds.select("lil").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()).get(0); - assertEquals("xml", result); - String result2 = ds.select("json").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()).get(0); - assertEquals("debugo", result2); - } - ); + streamingTestUtil.performDPLTest("index=index_A | spath", JSON_DATA_1, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, json, lil]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" 
+ ); + String result = ds + .select("lil") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + assertEquals("xml", result); + String result2 = ds + .select("json") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + assertEquals("debugo", result2); + }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void spathTestNestedJsonData() { - streamingTestUtil.performDPLTest( - "index=index_A | spath output=log path=.log", - JSON_DATA_NESTED, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, log]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - String result = ds.select("log").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()).get(0); - assertEquals("{\"auditID\":\"x\",\"requestURI\":\"/path\",\"user\":{\"name\":\"sys\",\"group\":[\"admins\",\"nosucherror\"]},\"method\":\"GET\",\"remoteAddr\":\"127.0.0.123:1025\",\"requestTimestamp\":\"2022-12-14T11:56:13Z\",\"responseTimestamp\":\"2022-12-14T11:56:13Z\",\"responseCode\":503,\"requestHeader\":{\"Accept-Encoding\":[\"gzip\"],\"User-Agent\":[\"Go-http-client/2.0\"]}}", result); - } - ); + streamingTestUtil.performDPLTest("index=index_A | spath output=log path=.log", JSON_DATA_NESTED, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, log]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + String result = ds + .select("log") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + assertEquals( + "{\"auditID\":\"x\",\"requestURI\":\"/path\",\"user\":{\"name\":\"sys\",\"group\":[\"admins\",\"nosucherror\"]},\"method\":\"GET\",\"remoteAddr\":\"127.0.0.123:1025\",\"requestTimestamp\":\"2022-12-14T11:56:13Z\",\"responseTimestamp\":\"2022-12-14T11:56:13Z\",\"responseCode\":503,\"requestHeader\":{\"Accept-Encoding\":[\"gzip\"],\"User-Agent\":[\"Go-http-client/2.0\"]}}", + result + ); + }); } // FIXME: Seems like struck unescapes in eval, and the unescaped _raw is given to spath. 
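    // The disabled test below documents the escaping issue noted in the FIXME above:
    // the query evals _raw to the JSON literal {"kissa" : "fluff"} and then runs
    // spath input=_raw output=otus path=kissa, which is meant to extract "fluff"
    // into the otus field once the unescaping behaviour is fixed; its assertions
    // are still a TODO.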
@Disabled - @Test + @Test //@DisabledIfSystemProperty(named="skipSparkTest", matches="true") public void spathTestEvaledJsonData() { - streamingTestUtil.performDPLTest( - "| eval _raw = \"{\\\"kissa\\\" : \\\"fluff\\\"}\" | spath input=_raw output=otus path=kissa", - "empty _raw", - ds -> { - // TODO Assertions - } - ); + streamingTestUtil + .performDPLTest( + "| eval _raw = \"{\\\"kissa\\\" : \\\"fluff\\\"}\" | spath input=_raw output=otus path=kissa", + "empty _raw", ds -> { + // TODO Assertions + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void spathTest_invalidInput() { - streamingTestUtil.performDPLTest( - "index=index_A | spath path=abc", - INVALID_DATA, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, abc]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - Object result = ds - .select("abc") - .dropDuplicates() - .collectAsList() - .stream().map(r -> r.getAs(0)) - .collect(Collectors.toList()).get(0); - assertNull(result); - } - ); + streamingTestUtil.performDPLTest("index=index_A | spath path=abc", INVALID_DATA, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, abc]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + Object result = ds + .select("abc") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0)) + .collect(Collectors.toList()) + .get(0); + assertNull(result); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void spathTest_invalidInputAutoExtraction() { - streamingTestUtil.performDPLTest( - "index=index_A | spath", - INVALID_DATA, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - } - ); + streamingTestUtil.performDPLTest("index=index_A | spath", INVALID_DATA, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void spathTest_invalidInputManualExtraction() { - streamingTestUtil.performDPLTest( - "index=index_A | spath path=\"randomPathThatDoesNotExist\"", - INVALID_DATA, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, randomPathThatDoesNotExist]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); + streamingTestUtil + .performDPLTest("index=index_A | spath path=\"randomPathThatDoesNotExist\"", INVALID_DATA, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, randomPathThatDoesNotExist]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" 
+ ); // all should be nulls, so distinct() returns 1 row List rows = ds.select("randomPathThatDoesNotExist").distinct().collectAsList(); assertEquals(1, rows.size()); // make sure it is null assertEquals(streamingTestUtil.getCtx().nullValue.value(), rows.get(0).get(0)); - } - ); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void spathTest_ImplicitPath() { - streamingTestUtil.performDPLTest( - "index=index_A | spath json", - JSON_DATA_1, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, json, lil]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - String json = ds.select("json").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()).get(0); - String lil = ds.select("lil").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()).get(0); - assertEquals("debugo", json); - assertEquals("xml", lil); - } - ); + streamingTestUtil.performDPLTest("index=index_A | spath json", JSON_DATA_1, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, json, lil]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + String json = ds + .select("json") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + String lil = ds + .select("lil") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + assertEquals("debugo", json); + assertEquals("xml", lil); + }); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/StackTest.java b/src/test/java/com/teragrep/pth10/StackTest.java index 2eef27e..6b36dc9 100644 --- a/src/test/java/com/teragrep/pth10/StackTest.java +++ b/src/test/java/com/teragrep/pth10/StackTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -61,156 +61,178 @@ import static org.junit.jupiter.api.Assertions.assertEquals; /** - * Tests for the new ProcessingStack implementation - * Uses streaming datasets + * Tests for the new ProcessingStack implementation Uses streaming datasets + * * @author eemhu - * */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class StackTest { - private static final Logger LOGGER = LoggerFactory.getLogger(StackTest.class); - - private final String testFile = "src/test/resources/predictTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); - - private StreamingTestUtil streamingTestUtil; - - @org.junit.jupiter.api.BeforeAll - void setEnv() { - this.streamingTestUtil = new StreamingTestUtil(this.testSchema); - this.streamingTestUtil.setEnv(); - } - - @org.junit.jupiter.api.BeforeEach - void setUp() { - this.streamingTestUtil.setUp(); - } - - @org.junit.jupiter.api.AfterEach - void tearDown() { - this.streamingTestUtil.tearDown(); - } - - - // ---------------------------------------- - // Tests - // ---------------------------------------- - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") /* chart -> chart */ - public void stackTest_Streaming_ChartChart() { - streamingTestUtil.performDPLTest( - "index=index_A | chart count(offset) as c_offset by partition | chart count(c_offset) as final", - testFile, - ds -> { - assertEquals(Arrays.toString(ds.columns()), "[final]", "Batch handler dataset contained an unexpected column arrangement !"); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void stackTest_Streaming_1() { - streamingTestUtil.performDPLTest( - "index=index_A", - testFile, - ds -> { - assertEquals(Arrays.toString(ds.columns()), "[_time, id, _raw, index, sourcetype, host, source, partition, offset]", - "Batch handler dataset contained an unexpected column arrangement !"); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") /* eval */ - public void stackTest_Streaming_Eval() { - streamingTestUtil.performDPLTest( - "index=index_A | eval newField = offset * 5", - testFile, - ds -> { - assertEquals(Arrays.toString(ds.columns()), "[_time, id, _raw, index, sourcetype, host, source, partition, offset, newField]", - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") /* stats -> chart */ - public void stackTest_Streaming_StatsChart() { - streamingTestUtil.performDPLTest( - "index=index_A | stats count(_raw) as 
raw_count | chart count(raw_count) as count", - testFile, - ds -> { - assertEquals(Arrays.toString(ds.columns()), "[count]", - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") /* stats -> stats */ - public void stackTest_Streaming_StatsStats() { - streamingTestUtil.performDPLTest( - "index=index_A | stats avg(offset) as avg1 count(offset) as c_offset dc(offset) as dc | stats count(avg1) as c_avg count(c_offset) as c_count count(dc) as c_dc", - testFile, - ds -> { - assertEquals(Arrays.toString(ds.columns()), "[c_avg, c_count, c_dc]", - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") /* stats -> chart -> eval */ - public void stackTest_Streaming_StatsChartEval() { - streamingTestUtil.performDPLTest( - "index=index_A | stats avg(offset) as avg_offset | chart count(avg_offset) as c_avg_offset | eval final=c_avg_offset * 5", - testFile, - ds -> { - assertEquals(Arrays.toString(ds.columns()), "[c_avg_offset, final]", - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") /* eval -> eval -> eval -> stats -> chart */ - public void stackTest_Streaming_EvalEvalEvalStatsChart() { - streamingTestUtil.performDPLTest( - "index=index_A | eval a=exp(offset) | eval b=pow(a, 2) | eval x = a + b | stats var(x) as field | chart count(field) as final", - testFile, - ds -> { - assertEquals(Arrays.toString(ds.columns()), "[final]", - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); - } - - // TODO: remove disabled annotation when pth-03 issue #125 is closed - // Fails because c is parsed as count() command, not a column - @Test - @Disabled(value = "requires parser fixes") - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") /* eval -> eval -> eval -> stats -> chart */ - public void stackTest_Streaming_EvalEvalEvalStatsChart_with_c() { - streamingTestUtil.performDPLTest( - "index=index_A | eval a=exp(offset) | eval b=pow(a, 2) | eval c = a + b | stats var(c) as field | chart count(field) as final", - testFile, - ds -> { - assertEquals(Arrays.toString(ds.columns()), "[final]", - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); - } + + private static final Logger LOGGER = LoggerFactory.getLogger(StackTest.class); + + private final String testFile = "src/test/resources/predictTransformationTest_data*.json"; // * to make the path into a directory path + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); + + 
private StreamingTestUtil streamingTestUtil; + + @org.junit.jupiter.api.BeforeAll + void setEnv() { + this.streamingTestUtil = new StreamingTestUtil(this.testSchema); + this.streamingTestUtil.setEnv(); + } + + @org.junit.jupiter.api.BeforeEach + void setUp() { + this.streamingTestUtil.setUp(); + } + + @org.junit.jupiter.api.AfterEach + void tearDown() { + this.streamingTestUtil.tearDown(); + } + + // ---------------------------------------- + // Tests + // ---------------------------------------- + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) /* chart -> chart */ + public void stackTest_Streaming_ChartChart() { + streamingTestUtil + .performDPLTest( + "index=index_A | chart count(offset) as c_offset by partition | chart count(c_offset) as final", + testFile, ds -> { + assertEquals( + Arrays.toString(ds.columns()), "[final]", "Batch handler dataset contained an unexpected column arrangement !" + ); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void stackTest_Streaming_1() { + streamingTestUtil.performDPLTest("index=index_A", testFile, ds -> { + assertEquals( + Arrays.toString(ds.columns()), "[_time, id, _raw, index, sourcetype, host, source, partition, offset]", "Batch handler dataset contained an unexpected column arrangement !" + ); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) /* eval */ + public void stackTest_Streaming_Eval() { + streamingTestUtil.performDPLTest("index=index_A | eval newField = offset * 5", testFile, ds -> { + assertEquals( + Arrays.toString(ds.columns()), "[_time, id, _raw, index, sourcetype, host, source, partition, offset, newField]", "Batch handler dataset contained an unexpected column arrangement !" + ); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) /* stats -> chart */ + public void stackTest_Streaming_StatsChart() { + streamingTestUtil + .performDPLTest( + "index=index_A | stats count(_raw) as raw_count | chart count(raw_count) as count", testFile, + ds -> { + assertEquals( + Arrays.toString(ds.columns()), "[count]", "Batch handler dataset contained an unexpected column arrangement !" + ); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) /* stats -> stats */ + public void stackTest_Streaming_StatsStats() { + streamingTestUtil + .performDPLTest( + "index=index_A | stats avg(offset) as avg1 count(offset) as c_offset dc(offset) as dc | stats count(avg1) as c_avg count(c_offset) as c_count count(dc) as c_dc", + testFile, ds -> { + assertEquals( + Arrays.toString(ds.columns()), "[c_avg, c_count, c_dc]", "Batch handler dataset contained an unexpected column arrangement !" + ); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) /* stats -> chart -> eval */ + public void stackTest_Streaming_StatsChartEval() { + streamingTestUtil + .performDPLTest( + "index=index_A | stats avg(offset) as avg_offset | chart count(avg_offset) as c_avg_offset | eval final=c_avg_offset * 5", + testFile, ds -> { + assertEquals( + Arrays.toString(ds.columns()), "[c_avg_offset, final]", "Batch handler dataset contained an unexpected column arrangement !" 
+ ); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) /* eval -> eval -> eval -> stats -> chart */ + public void stackTest_Streaming_EvalEvalEvalStatsChart() { + streamingTestUtil + .performDPLTest( + "index=index_A | eval a=exp(offset) | eval b=pow(a, 2) | eval x = a + b | stats var(x) as field | chart count(field) as final", + testFile, ds -> { + assertEquals( + Arrays.toString(ds.columns()), "[final]", "Batch handler dataset contained an unexpected column arrangement !" + ); + } + ); + } + + // TODO: remove disabled annotation when pth-03 issue #125 is closed + // Fails because c is parsed as count() command, not a column + @Test + @Disabled(value = "requires parser fixes") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) /* eval -> eval -> eval -> stats -> chart */ + public void stackTest_Streaming_EvalEvalEvalStatsChart_with_c() { + streamingTestUtil + .performDPLTest( + "index=index_A | eval a=exp(offset) | eval b=pow(a, 2) | eval c = a + b | stats var(c) as field | chart count(field) as final", + testFile, ds -> { + assertEquals( + Arrays.toString(ds.columns()), "[final]", "Batch handler dataset contained an unexpected column arrangement !" + ); + } + ); + } } - diff --git a/src/test/java/com/teragrep/pth10/StrcatTransformationTest.java b/src/test/java/com/teragrep/pth10/StrcatTransformationTest.java index 6bae944..8f513a4 100644 --- a/src/test/java/com/teragrep/pth10/StrcatTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/StrcatTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -62,21 +62,20 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class StrcatTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(StrcatTransformationTest.class); private final String testFile = "src/test/resources/strcatTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -96,12 +95,14 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // --- Catalyst emit mode tests --- - - // strcat without allRequired parameter provided (defaults to allRequired=f) + + // strcat without allRequired parameter provided (defaults to allRequired=f) @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void strcatTransformTest() { String q = "index=index_A | strcat _raw sourcetype \"literal\" dest"; @@ -110,12 +111,21 @@ void strcatTransformTest() { assertTrue(Arrays.toString(res.columns()).contains("dest")); // List of expected values for the strcat destination field - List expectedValues = new ArrayList<>(Arrays.asList( - "raw 01A:X:0literal", "raw 02A:X:0literal", "raw 03A:Y:0literal", "raw 04A:Y:0literal", "raw 05A:Y:0literal" - )); + List expectedValues = new ArrayList<>( + Arrays + .asList( + "raw 01A:X:0literal", "raw 02A:X:0literal", "raw 03A:Y:0literal", + "raw 04A:Y:0literal", "raw 05A:Y:0literal" + ) + ); // Destination field from result dataset - List destAsList = res.select("dest").collectAsList().stream().map(r -> r.getString(0)).collect(Collectors.toList()); + List destAsList = res + .select("dest") + .collectAsList() + .stream() + .map(r -> r.getString(0)) + 
.collect(Collectors.toList()); Collections.sort(expectedValues); Collections.sort(destAsList); @@ -124,24 +134,36 @@ void strcatTransformTest() { assertEquals(expectedValues, destAsList); }); } - + // strcat with allRequired=True @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void strcatTransformAllRequiredTrueTest() { String q = "index=index_A | strcat allrequired=t _raw \"literal\" sourcetype dest"; - + streamingTestUtil.performDPLTest(q, testFile, res -> { // check if result contains the column that was created for strcat result assertTrue(Arrays.toString(res.columns()).contains("dest")); // List of expected values for the strcat destination field - List expectedValues = new ArrayList<>(Arrays.asList( - "raw 01literalA:X:0", "raw 02literalA:X:0", "raw 03literalA:Y:0", "raw 04literalA:Y:0", "raw 05literalA:Y:0" - )); + List expectedValues = new ArrayList<>( + Arrays + .asList( + "raw 01literalA:X:0", "raw 02literalA:X:0", "raw 03literalA:Y:0", + "raw 04literalA:Y:0", "raw 05literalA:Y:0" + ) + ); // Destination field from result dataset - List destAsList = res.select("dest").collectAsList().stream().map(r -> r.getString(0)).collect(Collectors.toList()); + List destAsList = res + .select("dest") + .collectAsList() + .stream() + .map(r -> r.getString(0)) + .collect(Collectors.toList()); Collections.sort(expectedValues); Collections.sort(destAsList); @@ -150,10 +172,13 @@ void strcatTransformAllRequiredTrueTest() { assertEquals(expectedValues, destAsList); }); } - + // strcat with allRequired=False @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void strcatTransformAllRequiredFalseTest() { String q = "index=index_A | strcat allrequired=f _raw sourcetype \"hello world\" dest"; @@ -162,12 +187,21 @@ void strcatTransformAllRequiredFalseTest() { assertTrue(Arrays.toString(res.columns()).contains("dest")); // List of expected values for the strcat destination field - List expectedValues = new ArrayList<>(Arrays.asList( - "raw 01A:X:0hello world", "raw 02A:X:0hello world", "raw 03A:Y:0hello world", "raw 04A:Y:0hello world", "raw 05A:Y:0hello world" - )); + List expectedValues = new ArrayList<>( + Arrays + .asList( + "raw 01A:X:0hello world", "raw 02A:X:0hello world", "raw 03A:Y:0hello world", + "raw 04A:Y:0hello world", "raw 05A:Y:0hello world" + ) + ); // Destination field from result dataset - List destAsList = res.select("dest").collectAsList().stream().map(r -> r.getString(0)).collect(Collectors.toList()); + List destAsList = res + .select("dest") + .collectAsList() + .stream() + .map(r -> r.getString(0)) + .collect(Collectors.toList()); Collections.sort(expectedValues); Collections.sort(destAsList); @@ -177,33 +211,40 @@ void strcatTransformAllRequiredFalseTest() { }); } - + // strcat with allRequired=True AND missing(incorrect) field @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void strcatTransformAllRequiredTrueWithMissingFieldTest() { String q = "index=index_A | strcat allrequired=t _raw sourcetype NOT_A_REAL_FIELD \"literal\" dest"; - streamingTestUtil.performDPLTest(q, testFile, res -> { - // check if result contains the column that was created for strcat result - assertTrue(Arrays.toString(res.columns()).contains("dest")); + streamingTestUtil + .performDPLTest( + q, testFile, 
res -> { + // check if result contains the column that was created for strcat result + assertTrue(Arrays.toString(res.columns()).contains("dest")); - // List of expected values for the strcat destination field - List expectedValues = new ArrayList<>(Arrays.asList( - null, null, null, null, null - )); + // List of expected values for the strcat destination field + List expectedValues = new ArrayList<>(Arrays.asList(null, null, null, null, null)); - // Destination field from result dataset - List destAsList = res.select("dest").collectAsList().stream().map(r -> r.getString(0)).collect(Collectors.toList()); + // Destination field from result dataset + List destAsList = res.select("dest").collectAsList().stream().map(r -> r.getString(0)).collect(Collectors.toList()); - // assert dest field contents as equals with expected contents - assertEquals(expectedValues, destAsList); - }); + // assert dest field contents as equals with expected contents + assertEquals(expectedValues, destAsList); + } + ); } - - // strcat with allRequired=False AND missing(incorrect) field + + // strcat with allRequired=False AND missing(incorrect) field @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void strcatTransformAllRequiredFalseWithMissingFieldTest() { String q = "index=index_A | strcat allrequired=f _raw sourcetype \"literal\" NOT_A_REAL_FIELD dest"; @@ -212,12 +253,21 @@ void strcatTransformAllRequiredFalseWithMissingFieldTest() { assertTrue(Arrays.toString(res.columns()).contains("dest")); // List of expected values for the strcat destination field - List expectedValues = new ArrayList<>(Arrays.asList( - "raw 01A:X:0literal", "raw 02A:X:0literal", "raw 03A:Y:0literal", "raw 04A:Y:0literal", "raw 05A:Y:0literal" - )); + List expectedValues = new ArrayList<>( + Arrays + .asList( + "raw 01A:X:0literal", "raw 02A:X:0literal", "raw 03A:Y:0literal", + "raw 04A:Y:0literal", "raw 05A:Y:0literal" + ) + ); // Destination field from result dataset - List destAsList = res.select("dest").collectAsList().stream().map(r -> r.getString(0)).collect(Collectors.toList()); + List destAsList = res + .select("dest") + .collectAsList() + .stream() + .map(r -> r.getString(0)) + .collect(Collectors.toList()); Collections.sort(expectedValues); Collections.sort(destAsList); @@ -226,10 +276,13 @@ void strcatTransformAllRequiredFalseWithMissingFieldTest() { assertEquals(expectedValues, destAsList); }); } - + // strcat with allRequired=False AND three fields and two literals @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void strcatTransformWithMoreThanTwoFields() { String q = "index=index_A | strcat allrequired=f _raw \",\" sourcetype \",\" index dest"; @@ -238,12 +291,21 @@ void strcatTransformWithMoreThanTwoFields() { assertTrue(Arrays.toString(res.columns()).contains("dest")); // List of expected values for the strcat destination field - List expectedValues = new ArrayList<>(Arrays.asList( - "raw 01,A:X:0,index_A", "raw 02,A:X:0,index_A", "raw 03,A:Y:0,index_A", "raw 04,A:Y:0,index_A", "raw 05,A:Y:0,index_A" - )); + List expectedValues = new ArrayList<>( + Arrays + .asList( + "raw 01,A:X:0,index_A", "raw 02,A:X:0,index_A", "raw 03,A:Y:0,index_A", + "raw 04,A:Y:0,index_A", "raw 05,A:Y:0,index_A" + ) + ); // Destination field from result dataset - List destAsList = res.select("dest").collectAsList().stream().map(r -> 
r.getString(0)).collect(Collectors.toList()); + List destAsList = res + .select("dest") + .collectAsList() + .stream() + .map(r -> r.getString(0)) + .collect(Collectors.toList()); Collections.sort(expectedValues); Collections.sort(destAsList); diff --git a/src/test/java/com/teragrep/pth10/StreamingTestUtil.java b/src/test/java/com/teragrep/pth10/StreamingTestUtil.java index 81c7033..a11935b 100644 --- a/src/test/java/com/teragrep/pth10/StreamingTestUtil.java +++ b/src/test/java/com/teragrep/pth10/StreamingTestUtil.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10; import com.teragrep.pth10.ast.DPLAuditInformation; @@ -62,8 +61,6 @@ import org.apache.spark.sql.streaming.StreamingQuery; import org.apache.spark.sql.streaming.StreamingQueryException; import org.apache.spark.sql.types.StructType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.File; import java.util.Arrays; @@ -77,10 +74,11 @@ import static org.junit.jupiter.api.Assertions.*; /** - StreamingTestUtil is used to perform DPL queries in streaming tests. - Also has functions for setting up the test and tearing down, for use in BeforeEach-annotations etc. + * StreamingTestUtil is used to perform DPL queries in streaming tests. Also has functions for setting up the test and + * tearing down, for use in BeforeEach-annotations etc. **/ public class StreamingTestUtil { + private DPLParserCatalystContext ctx; private DPLParserCatalystVisitor catalystVisitor; private StructType schema; @@ -96,6 +94,7 @@ public StreamingTestUtil() { /** * Constructor with a schema parameter. If no schema is given, the dataframe's columns won't be in the same order. * If the test needs to take column order into account, using this constructor is crucial. + * * @param schema schema of the test file */ public StreamingTestUtil(StructType schema) { @@ -106,6 +105,7 @@ public StreamingTestUtil(StructType schema) { /** * Set to fail tests if ANTLR encounters any lexing or parsing errors, even if it can auto-recover. + * * @param strictParserMode Fail tests on any parsing error */ public void setStrictParserMode(boolean strictParserMode) { @@ -114,6 +114,7 @@ public void setStrictParserMode(boolean strictParserMode) { /** * Returns the boolean value indicating if strict parser mode is enabled. + * * @return boolean value indicating if strict parser mode is enabled. 
*/ public boolean isStrictParserMode() { @@ -122,6 +123,7 @@ public boolean isStrictParserMode() { /** * Set to print parse tree to System.out + * * @param printParseTree boolean value */ public void setPrintParseTree(boolean printParseTree) { @@ -130,6 +132,7 @@ public void setPrintParseTree(boolean printParseTree) { /** * Indicates if parse tree is print on test run + * * @return boolean value */ public boolean isPrintParseTree() { @@ -190,50 +193,76 @@ void tearDown() { /** * Performs a DPL query. - * @param query DPL Query - * @param assertions The function to run on the result dataset. Should contain assertions. - * @param testDirectory Directory path to specify which data to use in the test. + * + * @param query DPL Query + * @param assertions The function to run on the result dataset. Should contain assertions. + * @param testDirectory Directory path to specify which data to use in the test. * @param dataCustomizations Function to apply any data customizations for special cases such as relative timestamp * tests, where the time column of the data needs to change in order to keep the tests * functional over a longer period of time. */ - public void performDPLTest(String query, String testDirectory, Function, Dataset> dataCustomizations, - Consumer> assertions) { - assertThrowsDPLTest(false, null, query, testDirectory, dataCustomizations, assertions); + public void performDPLTest( + String query, + String testDirectory, + Function, Dataset> dataCustomizations, + Consumer> assertions + ) { + assertThrowsDPLTest(false, null, query, testDirectory, dataCustomizations, assertions); } - public T performThrowingDPLTest(Class clazz, String query, String testDirectory, Function, Dataset> dataCustomizations, - Consumer> assertions) { + public T performThrowingDPLTest( + Class clazz, + String query, + String testDirectory, + Function, Dataset> dataCustomizations, + Consumer> assertions + ) { return assertThrowsDPLTest(true, clazz, query, testDirectory, dataCustomizations, assertions); } - public T performThrowingDPLTest(Class clazz, String query, String testDirectory, - Consumer> assertions) { + public T performThrowingDPLTest( + Class clazz, + String query, + String testDirectory, + Consumer> assertions + ) { return assertThrowsDPLTest(true, clazz, query, testDirectory, (ds) -> ds, assertions); } /** * Performs a DPL query, without any special data customizations. - * @param query DPL Query - * @param assertions The function to run on the result dataset. Should contain assertions. + * + * @param query DPL Query + * @param assertions The function to run on the result dataset. Should contain assertions. * @param testDirectory Directory path to specify which data to use in the test. 
*/ public void performDPLTest(String query, String testDirectory, Consumer> assertions) { assertThrowsDPLTest(false, null, query, testDirectory, (ds) -> ds, assertions); } - private T assertThrowsDPLTest(boolean doesThrow, Class clazz, String query, String testDirectory, Function, Dataset> dataCustomizations, - Consumer> assertions) { + private T assertThrowsDPLTest( + boolean doesThrow, + Class clazz, + String query, + String testDirectory, + Function, Dataset> dataCustomizations, + Consumer> assertions + ) { if (doesThrow) { return assertThrows(clazz, () -> internalDPLTest(query, testDirectory, dataCustomizations, assertions)); - } else { + } + else { assertDoesNotThrow(() -> internalDPLTest(query, testDirectory, dataCustomizations, assertions)); } return null; } - private void internalDPLTest(String query, String testDirectory, Function, Dataset> dataCustomizations, - Consumer> assertions) throws TimeoutException { + private void internalDPLTest( + String query, + String testDirectory, + Function, Dataset> dataCustomizations, + Consumer> assertions + ) throws TimeoutException { if (this.catalystVisitor == null) { throw new NullPointerException("StreamingTestUtil's CatalystVisitor is null: setUp wasn't called"); } @@ -245,7 +274,8 @@ private void internalDPLTest(String query, String testDirectory, Function recognizer, Object offendingSymbol, - int line, int charPosInLine, String msg, RecognitionException e){ + public void syntaxError( + Recognizer recognizer, + Object offendingSymbol, + int line, + int charPosInLine, + String msg, + RecognitionException e + ) { fail(String.format("Lexer error at line %s:%s due to %s %s", line, charPosInLine, msg, e)); } }); @@ -281,10 +318,17 @@ public void syntaxError(Recognizer recognizer, Object offendingSymbol, // parser init DPLParser parser = new DPLParser(new CommonTokenStream(lexer)); if (this.strictParserMode) { - parser.addErrorListener(new BaseErrorListener(){ + parser.addErrorListener(new BaseErrorListener() { + @Override - public void syntaxError(Recognizer recognizer, Object offendingSymbol, - int line, int charPosInLine, String msg, RecognitionException e){ + public void syntaxError( + Recognizer recognizer, + Object offendingSymbol, + int line, + int charPosInLine, + String msg, + RecognitionException e + ) { fail(String.format("Parser error at line %s:%s due to %s %s", line, charPosInLine, msg, e)); } }); @@ -301,7 +345,8 @@ public void syntaxError(Recognizer recognizer, Object offendingSymbol, DataStreamWriter dsw; try { dsw = n.stepList.execute(); - } catch (StreamingQueryException e) { + } + catch (StreamingQueryException e) { throw new RuntimeException(e); } @@ -317,9 +362,9 @@ public void syntaxError(Recognizer recognizer, Object offendingSymbol, } /** - * Returns the internal cause string matching the given exception - * or if it was not found, throws RuntimeException. - * @param cause StreamingQueryException's cause Throwable + * Returns the internal cause string matching the given exception or if it was not found, throws RuntimeException. + * + * @param cause StreamingQueryException's cause Throwable * @param internalException Class type of expected Exception * @throws RuntimeException If cause string could not be found * @return "Caused by: java.lang.Exception: Message" type string @@ -328,12 +373,14 @@ public String getInternalCauseString(final Throwable cause, final Class. + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -62,73 +62,82 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class SubsearchStreamingTest { - private static final Logger LOGGER = LoggerFactory.getLogger(SubsearchStreamingTest.class); - private final String testFile = "src/test/resources/strcatTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); - private StreamingTestUtil streamingTestUtil; + private static final Logger LOGGER = LoggerFactory.getLogger(SubsearchStreamingTest.class); + private final String testFile = "src/test/resources/strcatTransformationTest_data*.json"; // * to make the path into a directory path + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); - @org.junit.jupiter.api.BeforeAll - void setEnv() { - this.streamingTestUtil = new StreamingTestUtil(this.testSchema); - this.streamingTestUtil.setEnv(); - } + private StreamingTestUtil streamingTestUtil; - @org.junit.jupiter.api.BeforeEach - void setUp() { - this.streamingTestUtil.setUp(); - } + @org.junit.jupiter.api.BeforeAll + void setEnv() { + this.streamingTestUtil = new StreamingTestUtil(this.testSchema); + this.streamingTestUtil.setEnv(); + } - @org.junit.jupiter.api.AfterEach - void tearDown() { - this.streamingTestUtil.tearDown(); - } - - - // ---------------------------------------- - // Tests - // ---------------------------------------- + @org.junit.jupiter.api.BeforeEach + void setUp() { + this.streamingTestUtil.setUp(); + } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void subsearchStreamingTest1() { - streamingTestUtil.performDPLTest( - "index=index_A [ search sourcetype=A:X:0 | fields offset ] ", - testFile, - ds -> { - List expectedValues = Arrays.asList("raw 01", "raw 02"); - List listOfResult = 
ds.select("_raw").distinct().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - Collections.sort(expectedValues); - Collections.sort(listOfResult); - assertEquals(expectedValues, listOfResult, "Batch consumer dataset did not contain the expected values !"); - }); + @org.junit.jupiter.api.AfterEach + void tearDown() { + this.streamingTestUtil.tearDown(); + } - } + // ---------------------------------------- + // Tests + // ---------------------------------------- - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void subsearchStreamingTest2() { - streamingTestUtil.performDPLTest( - "[ search sourcetype=A:X:0 | fields offset ] ", - testFile, - ds -> { - List expectedValues = Collections.emptyList(); - List listOfResult = ds.select("_raw").distinct().collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - Collections.sort(listOfResult); - assertEquals(expectedValues, listOfResult, "Batch consumer dataset did not contain the expected values !"); - }); - } + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void subsearchStreamingTest1() { + streamingTestUtil.performDPLTest("index=index_A [ search sourcetype=A:X:0 | fields offset ] ", testFile, ds -> { + List expectedValues = Arrays.asList("raw 01", "raw 02"); + List listOfResult = ds + .select("_raw") + .distinct() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + Collections.sort(expectedValues); + Collections.sort(listOfResult); + assertEquals(expectedValues, listOfResult, "Batch consumer dataset did not contain the expected values !"); + }); + + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void subsearchStreamingTest2() { + streamingTestUtil.performDPLTest("[ search sourcetype=A:X:0 | fields offset ] ", testFile, ds -> { + List expectedValues = Collections.emptyList(); + List listOfResult = ds + .select("_raw") + .distinct() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + Collections.sort(listOfResult); + assertEquals(expectedValues, listOfResult, "Batch consumer dataset did not contain the expected values !"); + }); + } } - diff --git a/src/test/java/com/teragrep/pth10/SyslogStreamTest.java b/src/test/java/com/teragrep/pth10/SyslogStreamTest.java index 09f2113..a03e0f0 100644 --- a/src/test/java/com/teragrep/pth10/SyslogStreamTest.java +++ b/src/test/java/com/teragrep/pth10/SyslogStreamTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -66,29 +66,27 @@ import static org.junit.jupiter.api.Assertions.*; /** - * Tests for | teragrep exec syslog stream - * Uses streaming datasets + * Tests for | teragrep exec syslog stream Uses streaming datasets + * * @author eemhu - * */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class SyslogStreamTest { + private static final Logger LOGGER = LoggerFactory.getLogger(SyslogStreamTest.class); private final String testFile = "src/test/resources/regexTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -108,14 +106,16 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- - @Disabled (value = "RLP-03 has to be updated") /* FIXME: Update rlp_03 library to work with new rlp_01 version! */ + @Disabled(value = "RLP-03 has to be updated") /* FIXME: Update rlp_03 library to work with new rlp_01 version! 
*/ @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // teragrep exec syslog stream + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // teragrep exec syslog stream public void syslogStreamSendingTest() { final int expectedSyslogs = 10; AtomicInteger numberOfSyslogMessagesSent = new AtomicInteger(); @@ -131,35 +131,41 @@ public void syslogStreamSendingTest() { final Server server = new Server(port, new SyslogFrameProcessor(cbFunction)); assertDoesNotThrow(server::start); - streamingTestUtil.performDPLTest( - "index=index_A | teragrep exec syslog stream host 127.0.0.1 port " + port, - testFile, - ds -> { - LOGGER.debug("Syslog msgs = <{}>", numberOfSyslogMessagesSent.get()); - assertEquals(expectedSyslogs, numberOfSyslogMessagesSent.get()); - - for (int i = 0; i < expectedSyslogs; i++) { - String s = arrayOfSyslogs.get(i); - for (int j = 0; j < expectedSyslogs; j++) { - if (i == j) continue; - assertFalse(arrayOfSyslogs.compareAndSet(j, s, s)); + streamingTestUtil + .performDPLTest( + "index=index_A | teragrep exec syslog stream host 127.0.0.1 port " + port, testFile, ds -> { + LOGGER.debug("Syslog msgs = <{}>", numberOfSyslogMessagesSent.get()); + assertEquals(expectedSyslogs, numberOfSyslogMessagesSent.get()); + + for (int i = 0; i < expectedSyslogs; i++) { + String s = arrayOfSyslogs.get(i); + for (int j = 0; j < expectedSyslogs; j++) { + if (i == j) + continue; + assertFalse(arrayOfSyslogs.compareAndSet(j, s, s)); + } + + } + assertAll("stop server", server::stop); } - - } - assertAll("stop server", server::stop); - } - ); + ); } @Disabled(value = "RLP-03 has to be updated") // FIXME: update rlp_03 @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") // teragrep exec syslog stream, with preceding aggregation command + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) // teragrep exec syslog stream, with preceding aggregation command public void syslogStreamSendingFailureTest() { - assertThrows(StreamingQueryException.class, () -> streamingTestUtil.performDPLTest( - "index=index_A | stats count(_raw) as craw | teragrep exec syslog stream host 127.0.0.1 port 9998", - testFile, - ds -> { - } - )); + assertThrows( + StreamingQueryException.class, + () -> streamingTestUtil + .performDPLTest( + "index=index_A | stats count(_raw) as craw | teragrep exec syslog stream host 127.0.0.1 port 9998", + testFile, ds -> { + } + ) + ); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/TableTransformationTest.java b/src/test/java/com/teragrep/pth10/TableTransformationTest.java index c4b438b..89b38fc 100644 --- a/src/test/java/com/teragrep/pth10/TableTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/TableTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -60,29 +60,27 @@ import static org.junit.jupiter.api.Assertions.assertEquals; /** - * Tests for the new ProcessingStack implementation - * Uses streaming datasets + * Tests for the new ProcessingStack implementation Uses streaming datasets + * * @author eemhu - * */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class TableTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(TableTransformationTest.class); private final String testFile = "src/test/resources/regexTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -102,74 +100,73 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void table_test_1() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat _time \"\" _time2 | table _time*", - testFile, - ds -> { - assertEquals("[_time, _time2]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); + streamingTestUtil.performDPLTest("index=index_A | strcat _time \"\" _time2 | table _time*", testFile, ds -> { + assertEquals( + "[_time, _time2]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" 
+ ); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void table_test_2() { - streamingTestUtil.performDPLTest( - "index=index_A | table index, offset", - testFile, - ds -> { - assertEquals("[index, offset]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); + streamingTestUtil.performDPLTest("index=index_A | table index, offset", testFile, ds -> { + assertEquals( + "[index, offset]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void table_test_3() { - streamingTestUtil.performDPLTest( - "index=index_A | table _time offset index", - testFile, - ds -> { - assertEquals("[_time, offset, index]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); + streamingTestUtil.performDPLTest("index=index_A | table _time offset index", testFile, ds -> { + assertEquals( + "[_time, offset, index]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + }); } // dangling meta character '?' issue @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void table_test_4_issue283() { - streamingTestUtil.performDPLTest( - "index=index_A | table \"?????\"", - testFile, - ds -> { - assertEquals("[]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); + streamingTestUtil.performDPLTest("index=index_A | table \"?????\"", testFile, ds -> { + assertEquals( + "[]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void table_test_5() { - streamingTestUtil.performDPLTest( - "index=index_A | strcat _time \"\" _time2 | table *ime*", - testFile, - ds -> { - assertEquals("[_time, _time2]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - } - ); + streamingTestUtil.performDPLTest("index=index_A | strcat _time \"\" _time2 | table *ime*", testFile, ds -> { + assertEquals( + "[_time, _time2]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" 
+ ); + }); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/TeragrepDynatraceTest.java b/src/test/java/com/teragrep/pth10/TeragrepDynatraceTest.java index aa1b479..9f7aac7 100644 --- a/src/test/java/com/teragrep/pth10/TeragrepDynatraceTest.java +++ b/src/test/java/com/teragrep/pth10/TeragrepDynatraceTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -66,27 +66,25 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class TeragrepDynatraceTest { + private static final Logger LOGGER = LoggerFactory.getLogger(TeragrepDynatraceTest.class); private final String testFile = "src/test/resources/IplocationTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; private ClientAndServer mockServer; private final int port = 9001; - @org.junit.jupiter.api.BeforeAll void setEnv() { this.streamingTestUtil = new StreamingTestUtil(this.testSchema); @@ -111,55 +109,49 @@ void stopServer() { mockServer.stop(); } - // ---------------------------------------- // Tests // ---------------------------------------- @Test - 
@DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgDynatraceTest() { // respond to metrics ingest - mockServer.when( - request() - .withPath("/metrics/ingest") - .withMethod("POST") - .withHeader("Content-Type", "text/plain; charset=utf-8") - ) - .respond(HttpClassCallback.callback(DynatraceTestAPICallback.class)); + mockServer + .when(request().withPath("/metrics/ingest").withMethod("POST").withHeader("Content-Type", "text/plain; charset=utf-8")).respond(HttpClassCallback.callback(DynatraceTestAPICallback.class)); // send post - this.streamingTestUtil.performDPLTest( - "index=* " + - "| stats count(_raw) avg(_raw) by sourcetype " + - "| teragrep exec dynatrace metric write", - testFile, ds -> { - }); + this.streamingTestUtil + .performDPLTest( + "index=* " + "| stats count(_raw) avg(_raw) by sourcetype " + + "| teragrep exec dynatrace metric write", + testFile, ds -> { + } + ); // two lines received mockServer.verify(request().withPath("/metrics/ingest"), VerificationTimes.exactly(2)); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgDynatraceNoAggregateTest() { // respond to metrics ingest - mockServer.when( - request() - .withPath("/metrics/ingest") - .withMethod("POST") - .withHeader("Content-Type", "text/plain; charset=utf-8") - ) - .respond(HttpClassCallback.callback(DynatraceTestAPICallback.class)); + mockServer + .when(request().withPath("/metrics/ingest").withMethod("POST").withHeader("Content-Type", "text/plain; charset=utf-8")).respond(HttpClassCallback.callback(DynatraceTestAPICallback.class)); // send post - Throwable th = - this.streamingTestUtil.performThrowingDPLTest( - RuntimeException.class, - "index=* " + - "| teragrep exec dynatrace metric write", - testFile, ds -> { - }); + Throwable th = this.streamingTestUtil + .performThrowingDPLTest( + RuntimeException.class, "index=* " + "| teragrep exec dynatrace metric write", testFile, ds -> { + } + ); // should not work without aggregate assertTrue(th.getMessage().endsWith("requires a preceding aggregate!")); @@ -169,27 +161,23 @@ public void tgDynatraceNoAggregateTest() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgDynatraceNonNumericDataTest() { // respond to metrics ingest - mockServer.when( - request() - .withPath("/metrics/ingest") - .withMethod("POST") - .withHeader("Content-Type", "text/plain; charset=utf-8") - ) - .respond(HttpClassCallback.callback(DynatraceTestAPICallback.class)); + mockServer + .when(request().withPath("/metrics/ingest").withMethod("POST").withHeader("Content-Type", "text/plain; charset=utf-8")).respond(HttpClassCallback.callback(DynatraceTestAPICallback.class)); // send post - Throwable th = - this.streamingTestUtil.performThrowingDPLTest( - StreamingQueryException.class, - "| makeresults count=10 " + - "| eval _raw = \"string\"" + - "| stats sum(_raw)" + - "| teragrep exec dynatrace metric write", - testFile, ds -> { - }); + Throwable th = this.streamingTestUtil + .performThrowingDPLTest( + StreamingQueryException.class, "| makeresults count=10 " + "| eval _raw = \"string\"" + + "| stats sum(_raw)" + "| teragrep exec dynatrace metric write", + testFile, ds -> { + } + ); // should not work with non-numeric data assertEquals("Non-numeric text was 
provided!", th.getCause().getMessage()); @@ -198,4 +186,3 @@ public void tgDynatraceNonNumericDataTest() { mockServer.verify(request().withPath("/metrics/ingest"), VerificationTimes.never()); } } - diff --git a/src/test/java/com/teragrep/pth10/TeragrepKafkaTest.java b/src/test/java/com/teragrep/pth10/TeragrepKafkaTest.java index 3858cb8..b01d180 100644 --- a/src/test/java/com/teragrep/pth10/TeragrepKafkaTest.java +++ b/src/test/java/com/teragrep/pth10/TeragrepKafkaTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10; import com.salesforce.kafka.test.junit5.SharedKafkaTestResource; @@ -82,25 +81,24 @@ public class TeragrepKafkaTest { private static final Logger LOGGER = LoggerFactory.getLogger(TeragrepKafkaTest.class); private final String testFile = "src/test/resources/IplocationTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; // automatically started and stopped via the annotation 
@RegisterExtension - public static final SharedKafkaTestResource sharedKafkaTestResource = new SharedKafkaTestResource().registerListener(new PlainListener().onPorts(42649)); + public static final SharedKafkaTestResource sharedKafkaTestResource = new SharedKafkaTestResource() + .registerListener(new PlainListener().onPorts(42649)); @org.junit.jupiter.api.BeforeAll void setEnv() { @@ -114,7 +112,11 @@ void setUp() { // Create config for kafka HashMap map = new HashMap<>(); - map.put("dpl.pth_10.transform.teragrep.kafka.save.bootstrap.servers", sharedKafkaTestResource.getKafkaConnectString().split("//")[1]); + map + .put( + "dpl.pth_10.transform.teragrep.kafka.save.bootstrap.servers", + sharedKafkaTestResource.getKafkaConnectString().split("//")[1] + ); map.put("dpl.pth_10.transform.teragrep.kafka.save.security.protocol", "PLAINTEXT"); map.put("dpl.pth_10.transform.teragrep.kafka.save.sasl.mechanism", "SASL_PLAINTEXT"); map.put("fs.s3a.access.key", "empty"); @@ -138,7 +140,10 @@ void tearDown() { // test teragrep exec kafka save @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void teragrepKafkaSaveTest() { String topic = "topic1"; String query = "index=index_A | teragrep exec kafka save " + topic; @@ -147,11 +152,14 @@ public void teragrepKafkaSaveTest() { LOGGER.info("Consumer dataset : <{}>", ds.schema()); // Create kafka consumer - try (final KafkaConsumer kafkaConsumer = - sharedKafkaTestResource.getKafkaTestUtils().getKafkaConsumer(StringDeserializer.class, StringDeserializer.class)) { + try ( + final KafkaConsumer kafkaConsumer = sharedKafkaTestResource + .getKafkaTestUtils() + .getKafkaConsumer(StringDeserializer.class, StringDeserializer.class) + ) { final List topicPartitionList = new ArrayList<>(); - for (final PartitionInfo partitionInfo: kafkaConsumer.partitionsFor(topic)) { + for (final PartitionInfo partitionInfo : kafkaConsumer.partitionsFor(topic)) { topicPartitionList.add(new TopicPartition(partitionInfo.topic(), partitionInfo.partition())); } kafkaConsumer.assign(topicPartitionList); @@ -163,7 +171,7 @@ public void teragrepKafkaSaveTest() { do { records = kafkaConsumer.poll(2000L); - for (ConsumerRecord record: records) { + for (ConsumerRecord record : records) { // Assert that there are correct values in kafka assertTrue(record.value().contains("\"source\":\"" + "127." + i + "." + i + "." 
+ i + "\"")); i++; @@ -176,7 +184,13 @@ public void teragrepKafkaSaveTest() { } // test the returned dataset - List offsets = ds.select("offset").orderBy("id").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List offsets = ds + .select("offset") + .orderBy("id") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); List actualOffsets = Arrays.asList("1", "2", "3", "4", "5"); assertEquals(actualOffsets, offsets); }); @@ -184,7 +198,10 @@ public void teragrepKafkaSaveTest() { // test with an aggregation before the command @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void teragrepKafkaSaveTest_aggregation() { String topic = "topic2"; String query = "index=index_A | stats count by offset | teragrep exec kafka save " + topic; @@ -196,11 +213,14 @@ public void teragrepKafkaSaveTest_aggregation() { this.streamingTestUtil.getCtx().flush(); // Create kafka consumer - try (final KafkaConsumer kafkaConsumer = - sharedKafkaTestResource.getKafkaTestUtils().getKafkaConsumer(StringDeserializer.class, StringDeserializer.class)) { + try ( + final KafkaConsumer kafkaConsumer = sharedKafkaTestResource + .getKafkaTestUtils() + .getKafkaConsumer(StringDeserializer.class, StringDeserializer.class) + ) { final List topicPartitionList = new ArrayList<>(); - for (final PartitionInfo partitionInfo: kafkaConsumer.partitionsFor(topic)) { + for (final PartitionInfo partitionInfo : kafkaConsumer.partitionsFor(topic)) { topicPartitionList.add(new TopicPartition(partitionInfo.topic(), partitionInfo.partition())); } kafkaConsumer.assign(topicPartitionList); @@ -212,7 +232,7 @@ public void teragrepKafkaSaveTest_aggregation() { do { records = kafkaConsumer.poll(2000L); - for (ConsumerRecord record: records) { + for (ConsumerRecord record : records) { // Assert that there are correct values in kafka (all offsets) for (int j = 1; j < 6; j++) { assertTrue(record.value().contains("\"offset\":" + j)); @@ -228,7 +248,13 @@ public void teragrepKafkaSaveTest_aggregation() { } // test the returned dataset - List offsets = ds.select("offset").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List offsets = ds + .select("offset") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); List actualOffsets = Arrays.asList("1", "2", "3", "4", "5"); assertEquals(actualOffsets, offsets); }); @@ -236,10 +262,14 @@ public void teragrepKafkaSaveTest_aggregation() { // test with two aggregations before the command @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void teragrepKafkaSaveTest_twoAggregations() { String topic = "topic3"; - String query = "index=index_A | stats count by offset | stats count by offset | teragrep exec kafka save " + topic; + String query = "index=index_A | stats count by offset | stats count by offset | teragrep exec kafka save " + + topic; this.streamingTestUtil.performDPLTest(query, this.testFile, ds -> { LOGGER.info("Consumer dataset : <{}>", ds.schema()); @@ -247,11 +277,14 @@ public void teragrepKafkaSaveTest_twoAggregations() { this.streamingTestUtil.getCtx().flush(); // Create kafka consumer - try (final KafkaConsumer kafkaConsumer = - 
sharedKafkaTestResource.getKafkaTestUtils().getKafkaConsumer(StringDeserializer.class, StringDeserializer.class)) { + try ( + final KafkaConsumer kafkaConsumer = sharedKafkaTestResource + .getKafkaTestUtils() + .getKafkaConsumer(StringDeserializer.class, StringDeserializer.class) + ) { final List topicPartitionList = new ArrayList<>(); - for (final PartitionInfo partitionInfo: kafkaConsumer.partitionsFor(topic)) { + for (final PartitionInfo partitionInfo : kafkaConsumer.partitionsFor(topic)) { topicPartitionList.add(new TopicPartition(partitionInfo.topic(), partitionInfo.partition())); } kafkaConsumer.assign(topicPartitionList); @@ -263,7 +296,7 @@ public void teragrepKafkaSaveTest_twoAggregations() { do { records = kafkaConsumer.poll(2000L); - for (ConsumerRecord record: records) { + for (ConsumerRecord record : records) { // Assert that there are correct values in kafka (all offsets) for (int j = 1; j < 6; j++) { assertTrue(record.value().contains("\"offset\":" + j)); @@ -278,7 +311,13 @@ public void teragrepKafkaSaveTest_twoAggregations() { } // test the returned dataset - List offsets = ds.select("offset").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List offsets = ds + .select("offset") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); List actualOffsets = Arrays.asList("1", "2", "3", "4", "5"); assertEquals(actualOffsets, offsets); }); @@ -286,7 +325,10 @@ public void teragrepKafkaSaveTest_twoAggregations() { // test with a "sequential_only" before the command @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void teragrepKafkaSaveTest_sequential() { String topic = "topic4"; String query = "index=index_A | sort num(offset) | teragrep exec kafka save " + topic; @@ -297,11 +339,14 @@ public void teragrepKafkaSaveTest_sequential() { this.streamingTestUtil.getCtx().flush(); // Create kafka consumer - try (final KafkaConsumer kafkaConsumer = - sharedKafkaTestResource.getKafkaTestUtils().getKafkaConsumer(StringDeserializer.class, StringDeserializer.class)) { + try ( + final KafkaConsumer kafkaConsumer = sharedKafkaTestResource + .getKafkaTestUtils() + .getKafkaConsumer(StringDeserializer.class, StringDeserializer.class) + ) { final List topicPartitionList = new ArrayList<>(); - for (final PartitionInfo partitionInfo: kafkaConsumer.partitionsFor(topic)) { + for (final PartitionInfo partitionInfo : kafkaConsumer.partitionsFor(topic)) { topicPartitionList.add(new TopicPartition(partitionInfo.topic(), partitionInfo.partition())); } kafkaConsumer.assign(topicPartitionList); @@ -313,7 +358,7 @@ public void teragrepKafkaSaveTest_sequential() { do { records = kafkaConsumer.poll(2000L); - for (ConsumerRecord record: records) { + for (ConsumerRecord record : records) { // Assert that there are correct values in kafka (test the source column) assertTrue(record.value().contains("\"source\":\"" + "127." + i + "." + i + "." 
+ i + "\"")); i++; @@ -326,7 +371,13 @@ public void teragrepKafkaSaveTest_sequential() { } // test the returned dataset - List offsets = ds.select("offset").orderBy("id").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List offsets = ds + .select("offset") + .orderBy("id") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); List actualOffsets = Arrays.asList("1", "2", "3", "4", "5"); assertEquals(actualOffsets, offsets); }); diff --git a/src/test/java/com/teragrep/pth10/TeragrepTransformationTest.java b/src/test/java/com/teragrep/pth10/TeragrepTransformationTest.java index c6c06f0..f7c688e 100644 --- a/src/test/java/com/teragrep/pth10/TeragrepTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/TeragrepTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -66,23 +66,22 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class TeragrepTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(TeragrepTransformationTest.class); private final String testFile = "src/test/resources/IplocationTransformationTest_data*.json"; // * to make the path into a directory path private String testResourcesPath; - private final StructType testSchema = new StructType( - new StructField[]{ - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", 
DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -103,336 +102,481 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsSaveLoadTest() { final String id = UUID.randomUUID().toString(); - streamingTestUtil.performDPLTest( - "index=index_A | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id + " | regex _raw = \"\" | teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, - testFile, - ds -> { - List listOfResult = ds.select("offset").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id + + " | regex _raw = \"\" | teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, + testFile, ds -> { + List listOfResult = ds + .select("offset") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); + } + ); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsSaveLoadCsvTest() { final String id = UUID.randomUUID().toString(); - streamingTestUtil.performDPLTest( - "index=index_A | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id + "/ format=CSV" + " | regex _raw = \"\" | teragrep exec hdfs load /tmp/pth_10_hdfs/" + id + " format=CSV", - testFile, - ds -> { - List listOfResult = ds.select("offset").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id + "/ format=CSV" + + " | regex _raw = \"\" | teragrep exec hdfs load /tmp/pth_10_hdfs/" + id + + " format=CSV", + testFile, ds -> { + List listOfResult = ds + .select("offset") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); + } + ); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsLoadEmptyAvroDatasetTest() { final String id = UUID.randomUUID().toString(); - this.streamingTestUtil.performDPLTest( - "index=index_B | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id + "/ format=avro" + " | regex _raw = \"\" | teragrep exec hdfs load /tmp/pth_10_hdfs/" + id + " format=avro", - this.testFile, - ds -> { - Assertions.assertTrue(ds.isEmpty()); - } - ); + this.streamingTestUtil + .performDPLTest( + "index=index_B | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id + "/ format=avro" + + " | regex _raw = \"\" | teragrep exec hdfs load /tmp/pth_10_hdfs/" + id + + " format=avro", + this.testFile, ds -> { + Assertions.assertTrue(ds.isEmpty()); + } + ); } @Test - 
@DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsLoadEmptyCsvDatasetTest() { final String id = UUID.randomUUID().toString(); - this.streamingTestUtil.performDPLTest( - "index=index_B | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id + "/ format=CSV" + " | regex _raw = \"\" | teragrep exec hdfs load /tmp/pth_10_hdfs/" + id + " format=CSV", - this.testFile, - ds -> { - Assertions.assertTrue(ds.isEmpty()); - } - ); + this.streamingTestUtil + .performDPLTest( + "index=index_B | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id + "/ format=CSV" + + " | regex _raw = \"\" | teragrep exec hdfs load /tmp/pth_10_hdfs/" + id + + " format=CSV", + this.testFile, ds -> { + Assertions.assertTrue(ds.isEmpty()); + } + ); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsLoadCustomCsvTest() { String dir = testResourcesPath.concat("/csv/hdfs.csv"); if (!Files.exists(Paths.get(dir))) { fail("Expected file does not exist: " + dir); } - streamingTestUtil.performDPLTest( - "| teragrep exec hdfs load " + dir + " format=CSV header=FALSE", - testFile, - ds -> { - List listOfResult = ds.select("_raw").orderBy("_raw").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList( - "2023-01-01T00:00:00z,stuff,1", - "2023-01-02T00:00:00z,other stuff,2", - "2023-01-03T00:00:00z,more other stuff,3", - "2023-01-04T00:00:00z,even more stuff,4", - "2023-01-05T00:00:00z,more otherer stuff,5", - "_time,_raw,offset"), - listOfResult); - } - ); + streamingTestUtil + .performDPLTest("| teragrep exec hdfs load " + dir + " format=CSV header=FALSE", testFile, ds -> { + List listOfResult = ds + .select("_raw") + .orderBy("_raw") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals( + Arrays + .asList( + "2023-01-01T00:00:00z,stuff,1", "2023-01-02T00:00:00z,other stuff,2", + "2023-01-03T00:00:00z,more other stuff,3", + "2023-01-04T00:00:00z,even more stuff,4", + "2023-01-05T00:00:00z,more otherer stuff,5", "_time,_raw,offset" + ), + listOfResult + ); + }); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsLoadCustomCsvWithHeaderTest() { String dir = testResourcesPath.concat("/csv/hdfs.csv"); if (!Files.exists(Paths.get(dir))) { fail("Expected file does not exist: " + dir); } - streamingTestUtil.performDPLTest( - "| teragrep exec hdfs load " + dir + " format=CSV header=TRUE", - testFile, - ds -> { - List listOfResult = ds.select("_raw").orderBy("_raw").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList( - "2023-01-01T00:00:00z,stuff,1", - "2023-01-02T00:00:00z,other stuff,2", - "2023-01-03T00:00:00z,more other stuff,3", - "2023-01-04T00:00:00z,even more stuff,4", - "2023-01-05T00:00:00z,more otherer stuff,5", - "_time,_raw,offset"), - listOfResult); - } - ); + streamingTestUtil + .performDPLTest("| teragrep exec hdfs load " + dir + " format=CSV header=TRUE", testFile, ds -> { + List listOfResult = ds + .select("_raw") + .orderBy("_raw") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals( + Arrays + .asList( + 
"2023-01-01T00:00:00z,stuff,1", "2023-01-02T00:00:00z,other stuff,2", + "2023-01-03T00:00:00z,more other stuff,3", + "2023-01-04T00:00:00z,even more stuff,4", + "2023-01-05T00:00:00z,more otherer stuff,5", "_time,_raw,offset" + ), + listOfResult + ); + }); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsLoadCustomCsvWithProvidedSchemaTest() { String dir = testResourcesPath.concat("/csv/hdfs.csv"); if (!Files.exists(Paths.get(dir))) { fail("Expected file does not exist: " + dir); } - streamingTestUtil.performDPLTest( - "| teragrep exec hdfs load " + dir + " format=CSV header=TRUE schema=\"_time,_raw,offset\"", - testFile, - ds -> { - List listOfResult = ds.select("offset").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); - } - ); + streamingTestUtil + .performDPLTest( + "| teragrep exec hdfs load " + dir + " format=CSV header=TRUE schema=\"_time,_raw,offset\"", + testFile, ds -> { + List listOfResult = ds + .select("offset") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); + } + ); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsSaveLoadWildcardTest() { final String id = UUID.randomUUID().toString(); - streamingTestUtil.performDPLTest( - "index=index_A | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id + "/" + id + - " | regex _raw = \"\" | teragrep exec hdfs load /tmp/pth_10_hdfs/" + id + "/*", - testFile, - ds -> { - List listOfResult = ds.select("offset").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id + "/" + id + + " | regex _raw = \"\" | teragrep exec hdfs load /tmp/pth_10_hdfs/" + id + "/*", + testFile, ds -> { + List listOfResult = ds + .select("offset") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); + } + ); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsListTest() { - streamingTestUtil.performDPLTest( - "| teragrep exec hdfs list ./src/test/resources/hdfslist/*", - testFile, - ds -> { - List listOfResult = ds.select("name").orderBy("name").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList("another_dummy_file.txt", "dummy_file.txt"), listOfResult); - } - ); + streamingTestUtil.performDPLTest("| teragrep exec hdfs list ./src/test/resources/hdfslist/*", testFile, ds -> { + List listOfResult = ds + .select("name") + .orderBy("name") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Arrays.asList("another_dummy_file.txt", "dummy_file.txt"), listOfResult); + }); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + 
@DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsListWildcardTest() { - streamingTestUtil.performDPLTest( - "| teragrep exec hdfs list ./src/test/resources/hdfslist/*.txt", - testFile, - ds -> { - List listOfResult = ds.select("name").orderBy("name").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + streamingTestUtil + .performDPLTest("| teragrep exec hdfs list ./src/test/resources/hdfslist/*.txt", testFile, ds -> { + List listOfResult = ds + .select("name") + .orderBy("name") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals(Arrays.asList("another_dummy_file.txt", "dummy_file.txt"), listOfResult); - } - ); + }); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsListInvalidPathTest() { - streamingTestUtil.performDPLTest( - "| teragrep exec hdfs list /tmp/this/path/does/not/exist", - testFile, - ds -> { - List listOfResult = ds.select("name").orderBy("name").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.emptyList(), listOfResult); - } - ); + streamingTestUtil.performDPLTest("| teragrep exec hdfs list /tmp/this/path/does/not/exist", testFile, ds -> { + List listOfResult = ds + .select("name") + .orderBy("name") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.emptyList(), listOfResult); + }); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsSaveAfterBloomEstimateTest() { final String id = UUID.randomUUID().toString(); - streamingTestUtil.performDPLTest( - "index=index_A | teragrep exec tokenizer | teragrep exec bloom estimate | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id, - testFile, - ds -> { - List listOfResult = ds.select("estimate(tokens)").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("5"), listOfResult); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | teragrep exec tokenizer | teragrep exec bloom estimate | teragrep exec hdfs save /tmp/pth_10_hdfs/" + + id, + testFile, ds -> { + List listOfResult = ds + .select("estimate(tokens)") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("5"), listOfResult); + } + ); this.streamingTestUtil.setUp(); - streamingTestUtil.performDPLTest( - "| teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, - testFile, - ds -> { - List listOfResult = ds.select("estimate(tokens)").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("5"), listOfResult); - } - ); + streamingTestUtil.performDPLTest("| teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, testFile, ds -> { + List listOfResult = ds + .select("estimate(tokens)") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("5"), listOfResult); + }); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void 
tgHdfsSaveAfterAggregateTest() { final String id = UUID.randomUUID().toString(); - streamingTestUtil.performDPLTest( - "index=index_A | stats avg(offset) AS avg_offset | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id, - testFile, - ds -> { - List listOfResult = ds.select("avg_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("3.0"), listOfResult); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | stats avg(offset) AS avg_offset | teragrep exec hdfs save /tmp/pth_10_hdfs/" + + id, + testFile, ds -> { + List listOfResult = ds + .select("avg_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("3.0"), listOfResult); + } + ); this.streamingTestUtil.setUp(); - streamingTestUtil.performDPLTest( - "| teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, - testFile, - ds -> { - List listOfResult = ds.select("avg_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("3.0"), listOfResult); - } - ); + streamingTestUtil.performDPLTest("| teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, testFile, ds -> { + List listOfResult = ds + .select("avg_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("3.0"), listOfResult); + }); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsSaveAfterTwoAggregationsTest() { final String id = UUID.randomUUID().toString(); - streamingTestUtil.performDPLTest( - "index=index_A | stats avg(offset) AS avg_offset | stats values(avg_offset) AS offset_values" + - " | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id, - testFile, - ds -> { - List listOfResult = ds.select("offset_values").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("3.0"), listOfResult); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | stats avg(offset) AS avg_offset | stats values(avg_offset) AS offset_values" + + " | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id, + testFile, ds -> { + List listOfResult = ds + .select("offset_values") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("3.0"), listOfResult); + } + ); this.streamingTestUtil.setUp(); - streamingTestUtil.performDPLTest( - "| teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, - testFile, - ds -> { - List listOfResult = ds.select("offset_values").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("3.0"), listOfResult); - } - ); + streamingTestUtil.performDPLTest("| teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, testFile, ds -> { + List listOfResult = ds + .select("offset_values") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("3.0"), listOfResult); + }); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsSaveAfterSequentialTest() { final String id = UUID.randomUUID().toString(); - 
streamingTestUtil.performDPLTest( - "index=index_A | sort num(offset) | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id, - testFile, - ds -> { - List listOfResult = ds.select("offset").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | sort num(offset) | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id, testFile, + ds -> { + List listOfResult = ds + .select("offset") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); + } + ); this.streamingTestUtil.setUp(); - streamingTestUtil.performDPLTest( - "| teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, - testFile, - ds -> { - List listOfResult = ds.select("offset").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); - } - ); + streamingTestUtil.performDPLTest("| teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, testFile, ds -> { + List listOfResult = ds + .select("offset") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); + }); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsSaveTest() { final String id = UUID.randomUUID().toString(); - streamingTestUtil.performDPLTest( - "index=index_A | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id, - testFile, - ds -> { - List listOfResult = ds.select("offset").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + streamingTestUtil + .performDPLTest("index=index_A | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id, testFile, ds -> { + List listOfResult = ds + .select("offset") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); - } - ); + }); this.streamingTestUtil.setUp(); - streamingTestUtil.performDPLTest( - "| teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, - testFile, - ds -> { - List listOfResult = ds.select("offset").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); - } - ); + streamingTestUtil.performDPLTest("| teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, testFile, ds -> { + List listOfResult = ds + .select("offset") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); + }); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgHdfsSaveOverwriteTest() { final String id = UUID.randomUUID().toString(); - streamingTestUtil.performDPLTest( - "index=index_A | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id + " overwrite=true", - testFile, - ds -> { - List listOfResult = ds.select("offset").orderBy("offset").collectAsList().stream().map(r 
-> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | teragrep exec hdfs save /tmp/pth_10_hdfs/" + id + " overwrite=true", testFile, + ds -> { + List listOfResult = ds + .select("offset") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); + } + ); this.streamingTestUtil.setUp(); - streamingTestUtil.performDPLTest( - "| teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, - testFile, - ds -> { - List listOfResult = ds.select("offset").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); - }); + streamingTestUtil.performDPLTest("| teragrep exec hdfs load /tmp/pth_10_hdfs/" + id, testFile, ds -> { + List listOfResult = ds + .select("offset") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Arrays.asList("1", "2", "3", "4", "5"), listOfResult); + }); } @Test - @DisabledIfSystemProperty(named = "skipSparkTest", matches = "true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tgGetArchiveSummaryTest() { - streamingTestUtil.performDPLTest( - "| teragrep get archive summary index=* offset < 3", - testFile, - ds -> { - List listOfResult = ds.select("offset").orderBy("offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList("1","2"), listOfResult); - }); + streamingTestUtil.performDPLTest("| teragrep get archive summary index=* offset < 3", testFile, ds -> { + List listOfResult = ds + .select("offset") + .orderBy("offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Arrays.asList("1", "2"), listOfResult); + }); } } - diff --git a/src/test/java/com/teragrep/pth10/TimechartStreamingTest.java b/src/test/java/com/teragrep/pth10/TimechartStreamingTest.java index f58a074..a1f76a1 100644 --- a/src/test/java/com/teragrep/pth10/TimechartStreamingTest.java +++ b/src/test/java/com/teragrep/pth10/TimechartStreamingTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -63,29 +63,27 @@ import static org.junit.jupiter.api.Assertions.assertTrue; /** - * Tests for the new ProcessingStack implementation - * Uses streaming datasets + * Tests for the new ProcessingStack implementation Uses streaming datasets + * * @author eemhu - * */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class TimechartStreamingTest { + private static final Logger LOGGER = LoggerFactory.getLogger(TimechartStreamingTest.class); private final String testFile = "src/test/resources/dedup_test_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -105,101 +103,129 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void timechartStreamingTest_1() { - streamingTestUtil.performDPLTest( - "index=index_A earliest=2020-01-01T00:00:00z latest=2021-01-01T00:00:00z | timechart span=1mon count(_raw) as craw by sourcetype", - testFile, - ds -> { - assertEquals("[_time, sourcetype, craw]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - List listOfTime = ds.select("_time").collectAsList(); - - // span buckets one per month (one extra due to timezones) - assertEquals(13, listOfTime.size()); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A earliest=2020-01-01T00:00:00z latest=2021-01-01T00:00:00z | timechart span=1mon count(_raw) as craw by sourcetype", + testFile, ds -> { + assertEquals( + "[_time, 
sourcetype, craw]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + + List listOfTime = ds.select("_time").collectAsList(); + + // span buckets one per month (one extra due to timezones) + assertEquals(13, listOfTime.size()); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void timechartStreamingTest_1b() { - streamingTestUtil.performDPLTest( - "index=index_A earliest=2020-12-12T00:00:00z latest=2020-12-12T00:30:00z | timechart span=1min count(_raw) as craw by sourcetype", - testFile, - ds -> { - assertEquals("[_time, sourcetype, craw]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - List listOfTime = ds.select("_time").collectAsList(); - - // span buckets one per minute for 30mins - assertEquals(31, listOfTime.size()); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A earliest=2020-12-12T00:00:00z latest=2020-12-12T00:30:00z | timechart span=1min count(_raw) as craw by sourcetype", + testFile, ds -> { + assertEquals( + "[_time, sourcetype, craw]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + + List listOfTime = ds.select("_time").collectAsList(); + + // span buckets one per minute for 30mins + assertEquals(31, listOfTime.size()); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void timechartStreamingTest_2() { - streamingTestUtil.performDPLTest( - "index=index_A | timechart span=1min count(_raw) as craw by sourcetype", - testFile, - ds -> { - assertEquals("[_time, sourcetype, craw]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - List listOfSourcetype = - ds.select("sourcetype").na().drop("any") - .dropDuplicates().collectAsList().stream().map(r->r.getAs(0).toString()).filter(str->!str.equals("0")) + streamingTestUtil + .performDPLTest( + "index=index_A | timechart span=1min count(_raw) as craw by sourcetype", testFile, ds -> { + assertEquals( + "[_time, sourcetype, craw]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" 
+ ); + + List listOfSourcetype = ds + .select("sourcetype") + .na() + .drop("any") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .filter(str -> !str.equals("0")) .collect(Collectors.toList()); - assertTrue(listOfSourcetype.contains("stream1") && listOfSourcetype.contains("stream2")); - assertEquals(2, listOfSourcetype.size()); - } - ); + assertTrue(listOfSourcetype.contains("stream1") && listOfSourcetype.contains("stream2")); + assertEquals(2, listOfSourcetype.size()); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void timechartStreamingTest_3() { - streamingTestUtil.performDPLTest( - "index=index_A | timechart count by host", - testFile, - ds -> { - assertEquals("[_time, host, count]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - List listOfHosts = ds.select("host").dropDuplicates().collectAsList().stream().map(r->r.getAs(0).toString()) - .filter(str -> !str.equals("0")).collect(Collectors.toList()); - - assertEquals(1, listOfHosts.size()); - } - ); + streamingTestUtil.performDPLTest("index=index_A | timechart count by host", testFile, ds -> { + assertEquals( + "[_time, host, count]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + + List listOfHosts = ds + .select("host") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .filter(str -> !str.equals("0")) + .collect(Collectors.toList()); + + assertEquals(1, listOfHosts.size()); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void timechartStreamingTest_4() { - streamingTestUtil.performDPLTest( - "index=index_A | timechart count", - testFile, - ds -> { - assertEquals("[_time, count]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - List listOfCount = ds.select("count").dropDuplicates().collectAsList().stream().map(r -> r.getAs(0).toString()) - .filter(str->!str.equals("0")).collect(Collectors.toList()); - - assertEquals(1, listOfCount.size()); - assertEquals("10", listOfCount.get(0)); - } - ); + streamingTestUtil.performDPLTest("index=index_A | timechart count", testFile, ds -> { + assertEquals( + "[_time, count]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" 
+ ); + + List listOfCount = ds + .select("count") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .filter(str -> !str.equals("0")) + .collect(Collectors.toList()); + + assertEquals(1, listOfCount.size()); + assertEquals("10", listOfCount.get(0)); + }); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/TokenizerTest.java b/src/test/java/com/teragrep/pth10/TokenizerTest.java index 567ae50..7877ceb 100644 --- a/src/test/java/com/teragrep/pth10/TokenizerTest.java +++ b/src/test/java/com/teragrep/pth10/TokenizerTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -60,22 +60,21 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class TokenizerTest { + private static final Logger LOGGER = LoggerFactory.getLogger(RexTransformationTest.class); private final String testFile = "src/test/resources/rexTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -100,41 +99,47 @@ void tearDown() { // ---------------------------------------- @Test - 
@DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tokenize() { - streamingTestUtil.performDPLTest( - "index=index_A | teragrep exec tokenizer", - testFile, - ds -> { - assertEquals("tokens", ds.columns()[ds.columns().length-1]); - }); + streamingTestUtil.performDPLTest("index=index_A | teragrep exec tokenizer", testFile, ds -> { + assertEquals("tokens", ds.columns()[ds.columns().length - 1]); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tokenize2() { - streamingTestUtil.performDPLTest( - "index=index_A | teragrep exec tokenizer format string input _raw output strtokens", - testFile, - ds -> { - String row = ds.select("strtokens").first().getList(0).toString(); - assertTrue(row.startsWith("[{, \", rainfall")); - assertEquals("strtokens", ds.columns()[ds.columns().length-1]); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | teragrep exec tokenizer format string input _raw output strtokens", testFile, + ds -> { + String row = ds.select("strtokens").first().getList(0).toString(); + assertTrue(row.startsWith("[{, \", rainfall")); + assertEquals("strtokens", ds.columns()[ds.columns().length - 1]); + } + ); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void tokenize3() { - streamingTestUtil.performDPLTest( - "index=index_A | teragrep exec tokenizer format bytes input _raw output bytetokens", - testFile, - ds -> { - String row = ds.select("bytetokens").first().getList(0).toString(); - assertTrue(row.startsWith("[[B")); // bytes start with '[[B' - assertEquals("bytetokens", ds.columns()[ds.columns().length-1]); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | teragrep exec tokenizer format bytes input _raw output bytetokens", testFile, + ds -> { + String row = ds.select("bytetokens").first().getList(0).toString(); + assertTrue(row.startsWith("[[B")); // bytes start with '[[B' + assertEquals("bytetokens", ds.columns()[ds.columns().length - 1]); + } + ); } } - - diff --git a/src/test/java/com/teragrep/pth10/UnimplementedCommandTest.java b/src/test/java/com/teragrep/pth10/UnimplementedCommandTest.java index 1202bc0..2fb98b9 100644 --- a/src/test/java/com/teragrep/pth10/UnimplementedCommandTest.java +++ b/src/test/java/com/teragrep/pth10/UnimplementedCommandTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -59,22 +59,21 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class UnimplementedCommandTest { + private static final Logger LOGGER = LoggerFactory.getLogger(UnimplementedCommandTest.class); private final String testFile = "src/test/resources/rexTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -99,20 +98,28 @@ void tearDown() { // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void unimplementedCommandTest() { final String query = "index=index_A | dump basefilename=\"test\""; - IllegalArgumentException iae = this.streamingTestUtil.performThrowingDPLTest(IllegalArgumentException.class, query, testFile, ds -> {}); + IllegalArgumentException iae = this.streamingTestUtil + .performThrowingDPLTest(IllegalArgumentException.class, query, testFile, ds -> { + }); assertEquals("The provided command 'dumpbasefilename=\"test\"' is not yet implemented.", iae.getMessage()); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void unimplementedCommandAfterAggregateTest() { final String query = "index=index_A | stats count | dump basefilename=\"test\""; - IllegalArgumentException iae = this.streamingTestUtil.performThrowingDPLTest(IllegalArgumentException.class, query, testFile, ds -> {}); + IllegalArgumentException iae = this.streamingTestUtil + .performThrowingDPLTest(IllegalArgumentException.class, query, testFile, ds -> { + }); assertEquals("The provided 
command 'dumpbasefilename=\"test\"' is not yet implemented.", iae.getMessage()); } } - - diff --git a/src/test/java/com/teragrep/pth10/UnquotedTextTest.java b/src/test/java/com/teragrep/pth10/UnquotedTextTest.java index 5cd125f..26e3c17 100644 --- a/src/test/java/com/teragrep/pth10/UnquotedTextTest.java +++ b/src/test/java/com/teragrep/pth10/UnquotedTextTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10; import com.teragrep.pth10.ast.TextString; diff --git a/src/test/java/com/teragrep/pth10/WhereTransformationTest.java b/src/test/java/com/teragrep/pth10/WhereTransformationTest.java index a129da0..04dad89 100644 --- a/src/test/java/com/teragrep/pth10/WhereTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/WhereTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -62,22 +62,21 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class WhereTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(WhereTransformationTest.class); private final String testFile = "src/test/resources/rex4jTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -102,66 +101,82 @@ void tearDown() { // ---------------------------------------- @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void WhereLikeWildcardTest() { - streamingTestUtil.performDPLTest( - "index=index_A | where _raw like \"%rainfall_rate%\"", - testFile, - ds -> { - // get extracted column data - List rawColumn = - ds.select("_raw").dropDuplicates() - .collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - assertEquals(1, rawColumn.size()); - }); + streamingTestUtil.performDPLTest("index=index_A | where _raw like \"%rainfall_rate%\"", testFile, ds -> { + // get extracted column data + List rawColumn = ds + .select("_raw") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + assertEquals(1, rawColumn.size()); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void WhereLikeSameTextTest() { - streamingTestUtil.performDPLTest( - "index=index_A | where index like \"index_A\"", - testFile, - ds -> { - // get extracted column data - List indexColumn = 
ds.select("index").dropDuplicates() - .collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - assertEquals(1, indexColumn.size()); - }); + streamingTestUtil.performDPLTest("index=index_A | where index like \"index_A\"", testFile, ds -> { + // get extracted column data + List indexColumn = ds + .select("index") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + assertEquals(1, indexColumn.size()); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void WhereLikeNoMatchTest() { - streamingTestUtil.performDPLTest( - "index=index_A | where index like \"index_A_\"", - testFile, - ds -> { - // get extracted column data - List indexColumn = ds.select("index").dropDuplicates() - .collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - assertEquals(0, indexColumn.size()); - }); + streamingTestUtil.performDPLTest("index=index_A | where index like \"index_A_\"", testFile, ds -> { + // get extracted column data + List indexColumn = ds + .select("index") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + assertEquals(0, indexColumn.size()); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void WhereLikeUnderscoreTest() { - streamingTestUtil.performDPLTest( - "index=index_A | where host like \"_ost\"", - testFile, - ds -> { - // get extracted column data - List hostColumn = ds.select("host").dropDuplicates() - .collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - assertEquals(1, hostColumn.size()); - }); + streamingTestUtil.performDPLTest("index=index_A | where host like \"_ost\"", testFile, ds -> { + // get extracted column data + List hostColumn = ds + .select("host") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + assertEquals(1, hostColumn.size()); + }); } } - - - diff --git a/src/test/java/com/teragrep/pth10/XmlkvTransformationTest.java b/src/test/java/com/teragrep/pth10/XmlkvTransformationTest.java index 5fcfc4b..9e0bc9f 100644 --- a/src/test/java/com/teragrep/pth10/XmlkvTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/XmlkvTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -61,28 +61,26 @@ import static org.junit.jupiter.api.Assertions.assertEquals; /** - * Tests for xmlkv command - * Uses streaming datasets + * Tests for xmlkv command. Uses streaming datasets * * @author eemhu */ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class XmlkvTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(XmlkvTransformationTest.class); - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); private StreamingTestUtil streamingTestUtil; @@ -102,7 +100,6 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // ---------------------------------------- // Tests // ---------------------------------------- @@ -113,66 +110,66 @@ void tearDown() { final String INVALID_DATA = "src/test/resources/xmlkv/xmlkv_inv*.json"; @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void xmlkvTest0() { - streamingTestUtil.performDPLTest( - "index=index_A | xmlkv _raw", - XML_DATA_2, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, item, something]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - - String result = ds - .select("item", "something") - .dropDuplicates() - .collectAsList() - .stream() - .map(r -> r.getAs(0).toString() - .concat(";") - .concat(r.getAs(1).toString()) - ) - .collect(Collectors.toList()).get(0); - assertEquals("b;123", result); - } - ); + streamingTestUtil.performDPLTest("index=index_A | xmlkv _raw", XML_DATA_2, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, item, something]", + Arrays.toString(ds.columns()), "Batch 
handler dataset contained an unexpected column arrangement !" + ); + + String result = ds + .select("item", "something") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString().concat(";").concat(r.getAs(1).toString())) + .collect(Collectors.toList()) + .get(0); + assertEquals("b;123", result); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void xmlkvTest1() { - streamingTestUtil.performDPLTest( - "index=index_A | xmlkv _raw", - XML_DATA_1, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset, item]", - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - - String result = ds - .select("item") - .dropDuplicates() - .collectAsList() - .stream() - .map( - r -> r.getAs(0).toString() - ) - .collect(Collectors.toList()).get(0); - assertEquals("Hello world", result); - } - ); + streamingTestUtil.performDPLTest("index=index_A | xmlkv _raw", XML_DATA_1, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset, item]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); + + String result = ds + .select("item") + .dropDuplicates() + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()) + .get(0); + assertEquals("Hello world", result); + }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void xmlkvTest2() { - streamingTestUtil.performDPLTest( - "index=index_A | xmlkv _raw", - INVALID_DATA, - ds -> { - // invalid data does not generate a result; only checking column arrangement - // to be the same as the input data. - assertEquals(Arrays.toString(testSchema.fieldNames()), - Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - } - ); + streamingTestUtil + .performDPLTest( + "index=index_A | xmlkv _raw", INVALID_DATA, ds -> { + // invalid data does not generate a result; only checking column arrangement + // to be the same as the input data. + assertEquals(Arrays.toString(testSchema.fieldNames()), Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); + } + ); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/chartTransformationTest.java b/src/test/java/com/teragrep/pth10/chartTransformationTest.java index dde9518..5cd3301 100644 --- a/src/test/java/com/teragrep/pth10/chartTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/chartTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -75,408 +75,461 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class chartTransformationTest { - private static final Logger LOGGER = LoggerFactory.getLogger(chartTransformationTest.class); - - String testFile = "src/test/resources/xmlWalkerTestDataStreaming/xmlWalkerTestDataStreaming*"; - SparkSession spark = null; - StreamingTestUtil streamingTestUtil; - - @org.junit.jupiter.api.BeforeAll - void setEnv() { - this.streamingTestUtil = new StreamingTestUtil(); - this.streamingTestUtil.setEnv(); - } - - @org.junit.jupiter.api.BeforeEach - void setUp() { - this.streamingTestUtil.setUp(); - } - - @org.junit.jupiter.api.AfterEach - void tearDown() { - this.streamingTestUtil.tearDown(); - } - - - // transformation operation test for count - // index =* |chart count(_raw) by host - @Disabled(value="Should be converted to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void parseChartcountTest() throws AnalysisException { - String q = "index = cinnamon | chart count(_raw) as count"; - String e = "SELECT count(_raw) AS count FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\""; - String result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - - @Disabled(value="Should be converted to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void parseChartCountColumNameTest() throws AnalysisException { - String q,e,result; - // Define column name for count - q = "index=cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw_) as cnt"; - - try { - long indexEarliestEpoch = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); - e = "SELECT count(_raw_) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime("+indexEarliestEpoch+")"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } catch (ParseException exception) { - fail(exception.getMessage()); - } - } - - @Disabled(value="Should be converted to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void parseChartCountWithLogicalOperationAndColumNameTest() throws AnalysisException { - String q,e,result; - // logical AND-part and named column - q = "index=cinnamon _index_earliest=\"04/16/2020:10:25:40\" _index_latest=\"04/16/2020:10:25:42\" | chart count(_raw) as count by timestamp"; - - try { - long indexEarliestEpoch2 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); - long indexLatestEpoch2 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); - e = "SELECT timestamp,count(_raw) AS count FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime("+indexEarliestEpoch2+") AND _time <= from_unixtime("+indexLatestEpoch2+") GROUP BY timestamp"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } catch (ParseException exception) { - fail(exception.getMessage()); - } - } - - @Disabled(value="Should be converted to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void parseChartCountDefaultNameTest() throws AnalysisException { - String q,e,result; - // Test autogenerated column names - q = "index = cinnamon | chart count(_raw) by host"; - e = "SELECT host,count(_raw) AS `count(_raw)` FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" GROUP BY host"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - - 
@Disabled(value="Should be converted to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void parseChartCountDefaultName1Test() throws AnalysisException { - String q,e,result; - q = "index=cinnamon _index_earliest=\"04/16/2020:10:25:40\" _index_latest=\"04/16/2020:10:25:42\" | chart count(_raw)"; - - try { - long earliestEpoch = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); - long latestEpoch = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:42"); - e = "SELECT count(_raw) AS `count(_raw)` FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime("+earliestEpoch+") AND _time <= from_unixtime("+latestEpoch+")"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } catch (ParseException exception) { - fail(exception.getMessage()); - } - } - - @Disabled(value="Should be converted to a dataframe test") - @Test - public void parseChainedTransformationTest() { - String q = "index=fs_mon host=\"$form.host$\" sourcetype=\"fs:mon-01:pid-cpu:0\" earliest=\"09/24/2018:00:00:00\" latest=\"9/24/2018:04:00:00\" | where 'usr-ms'!=\"184467440737095516160\" | where 'system-ms'!=\"184467440737095516160\" | eval ProcessWithPID=Command+\"@\"+PID | timechart useother=f sum(usr-ms) by ProcessWithPID"; - String e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND GROUP BY ProcessWithPID"; - String result = null; - assertEquals(e,result,q); - } - - - @Disabled(value="Should be converted to a dataframe test") - @Test - public void streamListWithTimeLimitsTest() { - String q = "earliest=-24h AND ( index = strawberry AND (sourcetype = \"example:strawberry:strawberry\" ) AND ( host = \"loadbalancer.example.com\" ) OR ( index = * AND host = \"firewall.example.com\" AND earliest = -90d Denied))"; - String e = "SELECT * FROM `strawberry` WHERE index LIKE \"strawberry\" AND _time >= from_unixtime(1618144982) AND ( AND (sourcetype LIKE \"example:strawberry:strawberry\") AND (host LIKE \"loadbalancer.example.com\") OR ( AND host LIKE \"firewall.example.com\" AND timestamp >= from_unixtime(1610455382) AND _raw LIKE '%Denied%'))"; - String result = null; - assertEquals(e,result,q); - } - - - @Disabled(value="Should be converted to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void countParsingTest() throws AnalysisException { - String q,e,result; - q = "index=cpu sourcetype=log:cpu:0 (host=sc-99-99-11-48 OR host=sc-99-99-13-164) | chart count(_raw) as cnt by host"; - e = "SELECT host,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cpu\" AND sourcetype LIKE \"log:cpu:0\" AND (host LIKE \"sc-99-99-11-48\" OR host LIKE \"sc-99-99-13-164\") GROUP BY host"; - result = utils.getQueryAnalysis(q); - assertEquals(e, result,q); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void endToEnd2Test() { - String q = "( index = index_A OR index = index_B ) _index_earliest=\"04/16/2003:10:25:40\" | chart count(_raw) as count by offset"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - - String e = "[offset: bigint, count: bigint]"; // At least schema is correct - assertEquals(e, res.toString()); - - // 3 first rows are earlier than where _index_earliest is set to - List expectedValues = new ArrayList<>(); - for (int i = 4; i < 11; i++) { - expectedValues.add(i + ",1"); - } - - List resultList = res.collectAsList().stream().map(r -> r.mkString(",")).collect(Collectors.toList()); - - 
// sort the lists, as the order of rows doesn't matter with this aggregation - Collections.sort(expectedValues); - Collections.sort(resultList); - - assertEquals(expectedValues, resultList); - boolean aggregates = this.streamingTestUtil.getCatalystVisitor().getAggregatesUsed(); - assertTrue(aggregates); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void endToEnd3Test() { - String q = "index = index_A _index_earliest=\"04/16/2003:10:25:40\" | chart count(_raw) as count by offset | where count > 0"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - - String e = "[offset: bigint, count: bigint]"; // At least schema is correct - assertEquals(e, res.toString()); - - List expectedValues = new ArrayList<>(); - // Only first 5 rows have index: index_A - // and only the latter 2 have _time after index_earliest - expectedValues.add(4 + ",1"); - expectedValues.add(5 + ",1"); - - List resultList = res.collectAsList().stream().map(r -> r.mkString(",")).collect(Collectors.toList()); - - // sort the lists, as the order of rows doesn't matter with this aggregation - Collections.sort(expectedValues); - Collections.sort(resultList); - - assertEquals(expectedValues, resultList); - res.printSchema(); - boolean aggregates = this.streamingTestUtil.getCatalystVisitor().getAggregatesUsed(); - assertTrue(aggregates); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void endToEnd4Test() { - String q = "index = index_B _index_earliest=\"04/16/2003:10:25:40\" | chart count(_raw)"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - - String e = "[count(_raw): bigint]"; // At least schema is correct - assertEquals(e, res.toString()); - - List expectedValues = new ArrayList<>(); - expectedValues.add("5"); // only last 5 rows have index: index_B - - List resultList = res.collectAsList().stream().map(r -> r.mkString(",")).collect(Collectors.toList()); - - // sort the lists, as the order of rows doesn't matter with this aggregation - Collections.sort(expectedValues); - Collections.sort(resultList); - - assertEquals(expectedValues, resultList); - res.printSchema(); - boolean aggregates = this.streamingTestUtil.getCatalystVisitor().getAggregatesUsed(); - assertTrue(aggregates); - }); - } - - // multiple chart aggregations - // specifically for issue #184 - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void chart_multipleAggs_issue184_Test() { - String q = "index=* | chart count(_raw), min(offset), max(offset) by index"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - final StructType expectedSchema = new StructType( - new StructField[] { - new StructField("index", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("count(_raw)", DataTypes.LongType, true, new MetadataBuilder().build()), - new StructField("min(offset)", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("max(offset)", DataTypes.StringType, true, new MetadataBuilder().build()) - } - ); - - assertEquals(expectedSchema, res.schema()); - - // assert contents - List count = res.select("count(_raw)").collectAsList(); - List min = res.select("min(offset)").collectAsList(); - List max = res.select("max(offset)").collectAsList(); - - Row cr = count.get(0); - Row minr = min.get(0); - Row maxr = max.get(0); - - Row cr2 = count.get(1); - Row minr2 = min.get(1); - Row maxr2 = max.get(1); - - assertEquals("5",cr.getAs(0).toString()); - 
assertEquals("1",minr.getAs(0).toString()); - assertEquals("5",maxr.getAs(0).toString()); - - assertEquals("5",cr2.getAs(0).toString()); - assertEquals("6",minr2.getAs(0).toString()); - assertEquals("10",maxr2.getAs(0).toString()); - }); - } - - // Check that is AggregatesUsed returns true - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void endToEnd5Test() { - String q = "index = jla02logger | chart count(_raw)"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - boolean aggregates = this.streamingTestUtil.getCatalystVisitor().getAggregatesUsed(); - assertTrue(aggregates); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void testSplittingByTime() { - String q = "index=* | chart avg(offset) by _time"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - final StructType expectedSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("avg(offset)", DataTypes.DoubleType, true, new MetadataBuilder().build()), - } - ); - - assertEquals(expectedSchema, res.schema()); - - List time = res.select("_time").collectAsList(); - List offset = res.select("avg(offset)").collectAsList(); - - System.out.println(time.stream().map(r -> r.getAs(0).toString()).toArray()); - - // assert correct ordering, old to new - String[] expectedTime = new String[]{ - "2001-01-01T01:01:01.010+03:00", "2002-02-02T02:02:02.020+03:00", - "2003-03-03T03:03:03.030+03:00", "2004-04-04T04:04:04.040+03:00", - "2005-05-05T05:05:05.050+03:00", "2006-06-06T06:06:06.060+03:00", - "2007-07-07T07:07:07.070+03:00", "2008-08-08T08:08:08.080+03:00", - "2009-09-09T09:09:09.090+03:00", "2010-10-10T10:10:10.100+03:00" - }; - String[] expectedOffset = new String[]{ "1.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0", "10.0" }; - - assertArrayEquals(expectedTime, time.stream().map(r -> r.getAs(0).toString()).toArray()); - assertArrayEquals(expectedOffset, offset.stream().map(r -> r.getAs(0).toString()).toArray()); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void testSplittingByString() { - String q = "index=* | chart avg(offset) by sourcetype"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - final StructType expectedSchema = new StructType( - new StructField[] { - new StructField("sourcetype", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("avg(offset)", DataTypes.DoubleType, true, new MetadataBuilder().build()), - } - ); - - assertEquals(expectedSchema, res.schema()); - - List sourcetype = res.select("sourcetype").collectAsList(); - List offset = res.select("avg(offset)").collectAsList(); - - // ascending ordering for strings - String[] expectedSourcetype = new String[]{"A:X:0", "A:Y:0", "B:X:0", "B:Y:0"}; - String[] expectedOffset = new String[]{"1.5", "4.0", "7.0", "9.5"}; - - assertArrayEquals(expectedSourcetype, sourcetype.stream().map(r -> r.getAs(0).toString()).toArray()); - assertArrayEquals(expectedOffset, offset.stream().map(r -> r.getAs(0).toString()).toArray()); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void testSplittingByNumber() { - String q = "index=* | chart count(offset) by offset"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - final StructType expectedSchema = new StructType( - new StructField[] { - new StructField("offset", DataTypes.LongType, 
true, new MetadataBuilder().build()), - new StructField("count(offset)", DataTypes.LongType, true, new MetadataBuilder().build()), - } - ); - - assertEquals(expectedSchema, res.schema()); - - List offset = res.select("offset").collectAsList(); - List count = res.select("count(offset)").collectAsList(); - - // assert correct ascending ordering - String[] expectedOffset = new String[]{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10"}; - String[] expectedCount = new String[]{"1", "1", "1", "1", "1", "1", "1", "1", "1", "1"}; - - assertArrayEquals(expectedOffset, offset.stream().map(r -> r.getAs(0).toString()).toArray()); - assertArrayEquals(expectedCount, count.stream().map(r -> r.getAs(0).toString()).toArray()); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void testSplittingByNumericalString() { - String q = "index=* | eval a = offset + 0 | chart count(offset) by a"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - final StructType expectedSchema = new StructType( - new StructField[] { - new StructField("a", DataTypes.StringType, true, new MetadataBuilder().build()), - new StructField("count(offset)", DataTypes.LongType, true, new MetadataBuilder().build()), - } - ); - - assertEquals(expectedSchema, res.schema()); - - List a = res.select("a").collectAsList(); - List count = res.select("count(offset)").collectAsList(); - - // assert correct ascending ordering - String[] expectedA = new String[]{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10"}; - String[] expectedCount = new String[]{"1", "1", "1", "1", "1", "1", "1", "1", "1", "1"}; - - assertArrayEquals(expectedA, a.stream().map(r -> r.getAs(0).toString()).toArray()); - assertArrayEquals(expectedCount, count.stream().map(r -> r.getAs(0).toString()).toArray()); - }); - } - - @Disabled(value="Should be converted to a dataframe test") - @Test - void endToEnd7Test() { - String q; - // First parse incoming DPL - // check that timechart returns also aggregatesUsed=true - q = "index = jla02logger | timechart span=1m count(_raw) by host"; - CharStream inputStream = CharStreams.fromString(q); - DPLLexer lexer = new DPLLexer(inputStream); - DPLParser parser = new DPLParser(new CommonTokenStream(lexer)); - ParseTree tree = parser.root(); - - DPLParserCatalystContext ctx = new DPLParserCatalystContext(spark); - // Use this file for dataset initialization - String testFile = "src/test/resources/xmlWalkerTestData.json"; - Dataset inDs = spark.read().json(testFile); - ctx.setDs(inDs); - ctx.setEarliest("-1Y"); - DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); - - CatalystNode n = (CatalystNode) visitor.visit(tree); - boolean aggregates = visitor.getAggregatesUsed(); - assertTrue(aggregates,visitor.getTraceBuffer().toString()); - } -} \ No newline at end of file + + private static final Logger LOGGER = LoggerFactory.getLogger(chartTransformationTest.class); + + String testFile = "src/test/resources/xmlWalkerTestDataStreaming/xmlWalkerTestDataStreaming*"; + SparkSession spark = null; + StreamingTestUtil streamingTestUtil; + + @org.junit.jupiter.api.BeforeAll + void setEnv() { + this.streamingTestUtil = new StreamingTestUtil(); + this.streamingTestUtil.setEnv(); + } + + @org.junit.jupiter.api.BeforeEach + void setUp() { + this.streamingTestUtil.setUp(); + } + + @org.junit.jupiter.api.AfterEach + void tearDown() { + this.streamingTestUtil.tearDown(); + } + + // transformation operation test for count + // index =* |chart count(_raw) by host + @Disabled(value = "Should be 
converted to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void parseChartcountTest() throws AnalysisException { + String q = "index = cinnamon | chart count(_raw) as count"; + String e = "SELECT count(_raw) AS count FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\""; + String result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + + @Disabled(value = "Should be converted to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void parseChartCountColumNameTest() throws AnalysisException { + String q, e, result; + // Define column name for count + q = "index=cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw_) as cnt"; + + try { + long indexEarliestEpoch = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); + e = "SELECT count(_raw_) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime(" + + indexEarliestEpoch + ")"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + catch (ParseException exception) { + fail(exception.getMessage()); + } + } + + @Disabled(value = "Should be converted to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void parseChartCountWithLogicalOperationAndColumNameTest() throws AnalysisException { + String q, e, result; + // logical AND-part and named column + q = "index=cinnamon _index_earliest=\"04/16/2020:10:25:40\" _index_latest=\"04/16/2020:10:25:42\" | chart count(_raw) as count by timestamp"; + + try { + long indexEarliestEpoch2 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); + long indexLatestEpoch2 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); + e = "SELECT timestamp,count(_raw) AS count FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime(" + + indexEarliestEpoch2 + ") AND _time <= from_unixtime(" + indexLatestEpoch2 + + ") GROUP BY timestamp"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + catch (ParseException exception) { + fail(exception.getMessage()); + } + } + + @Disabled(value = "Should be converted to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void parseChartCountDefaultNameTest() throws AnalysisException { + String q, e, result; + // Test autogenerated column names + q = "index = cinnamon | chart count(_raw) by host"; + e = "SELECT host,count(_raw) AS `count(_raw)` FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" GROUP BY host"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + + @Disabled(value = "Should be converted to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void parseChartCountDefaultName1Test() throws AnalysisException { + String q, e, result; + q = "index=cinnamon _index_earliest=\"04/16/2020:10:25:40\" _index_latest=\"04/16/2020:10:25:42\" | chart count(_raw)"; + + try { + long earliestEpoch = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); + long latestEpoch = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:42"); + e = "SELECT count(_raw) AS `count(_raw)` FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime(" + + earliestEpoch + ") AND _time <= from_unixtime(" + latestEpoch + ")"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + catch (ParseException exception) { + 
fail(exception.getMessage()); + } + } + + @Disabled(value = "Should be converted to a dataframe test") + @Test + public void parseChainedTransformationTest() { + String q = "index=fs_mon host=\"$form.host$\" sourcetype=\"fs:mon-01:pid-cpu:0\" earliest=\"09/24/2018:00:00:00\" latest=\"9/24/2018:04:00:00\" | where 'usr-ms'!=\"184467440737095516160\" | where 'system-ms'!=\"184467440737095516160\" | eval ProcessWithPID=Command+\"@\"+PID | timechart useother=f sum(usr-ms) by ProcessWithPID"; + String e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND GROUP BY ProcessWithPID"; + String result = null; + assertEquals(e, result, q); + } + + @Disabled(value = "Should be converted to a dataframe test") + @Test + public void streamListWithTimeLimitsTest() { + String q = "earliest=-24h AND ( index = strawberry AND (sourcetype = \"example:strawberry:strawberry\" ) AND ( host = \"loadbalancer.example.com\" ) OR ( index = * AND host = \"firewall.example.com\" AND earliest = -90d Denied))"; + String e = "SELECT * FROM `strawberry` WHERE index LIKE \"strawberry\" AND _time >= from_unixtime(1618144982) AND ( AND (sourcetype LIKE \"example:strawberry:strawberry\") AND (host LIKE \"loadbalancer.example.com\") OR ( AND host LIKE \"firewall.example.com\" AND timestamp >= from_unixtime(1610455382) AND _raw LIKE '%Denied%'))"; + String result = null; + assertEquals(e, result, q); + } + + @Disabled(value = "Should be converted to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void countParsingTest() throws AnalysisException { + String q, e, result; + q = "index=cpu sourcetype=log:cpu:0 (host=sc-99-99-11-48 OR host=sc-99-99-13-164) | chart count(_raw) as cnt by host"; + e = "SELECT host,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cpu\" AND sourcetype LIKE \"log:cpu:0\" AND (host LIKE \"sc-99-99-11-48\" OR host LIKE \"sc-99-99-13-164\") GROUP BY host"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result, q); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void endToEnd2Test() { + String q = "( index = index_A OR index = index_B ) _index_earliest=\"04/16/2003:10:25:40\" | chart count(_raw) as count by offset"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + + String e = "[offset: bigint, count: bigint]"; // At least schema is correct + assertEquals(e, res.toString()); + + // 3 first rows are earlier than where _index_earliest is set to + List expectedValues = new ArrayList<>(); + for (int i = 4; i < 11; i++) { + expectedValues.add(i + ",1"); + } + + List resultList = res + .collectAsList() + .stream() + .map(r -> r.mkString(",")) + .collect(Collectors.toList()); + + // sort the lists, as the order of rows doesn't matter with this aggregation + Collections.sort(expectedValues); + Collections.sort(resultList); + + assertEquals(expectedValues, resultList); + boolean aggregates = this.streamingTestUtil.getCatalystVisitor().getAggregatesUsed(); + assertTrue(aggregates); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void endToEnd3Test() { + String q = "index = index_A _index_earliest=\"04/16/2003:10:25:40\" | chart count(_raw) as count by offset | where count > 0"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + + String e = "[offset: bigint, count: bigint]"; // At least schema is correct + assertEquals(e, res.toString()); + + List expectedValues = new ArrayList<>(); + // Only first 5 
rows have index: index_A + // and only the latter 2 have _time after index_earliest + expectedValues.add(4 + ",1"); + expectedValues.add(5 + ",1"); + + List resultList = res + .collectAsList() + .stream() + .map(r -> r.mkString(",")) + .collect(Collectors.toList()); + + // sort the lists, as the order of rows doesn't matter with this aggregation + Collections.sort(expectedValues); + Collections.sort(resultList); + + assertEquals(expectedValues, resultList); + res.printSchema(); + boolean aggregates = this.streamingTestUtil.getCatalystVisitor().getAggregatesUsed(); + assertTrue(aggregates); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void endToEnd4Test() { + String q = "index = index_B _index_earliest=\"04/16/2003:10:25:40\" | chart count(_raw)"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + + String e = "[count(_raw): bigint]"; // At least schema is correct + assertEquals(e, res.toString()); + + List expectedValues = new ArrayList<>(); + expectedValues.add("5"); // only last 5 rows have index: index_B + + List resultList = res + .collectAsList() + .stream() + .map(r -> r.mkString(",")) + .collect(Collectors.toList()); + + // sort the lists, as the order of rows doesn't matter with this aggregation + Collections.sort(expectedValues); + Collections.sort(resultList); + + assertEquals(expectedValues, resultList); + res.printSchema(); + boolean aggregates = this.streamingTestUtil.getCatalystVisitor().getAggregatesUsed(); + assertTrue(aggregates); + }); + } + + // multiple chart aggregations + // specifically for issue #184 + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void chart_multipleAggs_issue184_Test() { + String q = "index=* | chart count(_raw), min(offset), max(offset) by index"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + final StructType expectedSchema = new StructType(new StructField[] { + new StructField("index", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("count(_raw)", DataTypes.LongType, true, new MetadataBuilder().build()), + new StructField("min(offset)", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("max(offset)", DataTypes.StringType, true, new MetadataBuilder().build()) + }); + + assertEquals(expectedSchema, res.schema()); + + // assert contents + List count = res.select("count(_raw)").collectAsList(); + List min = res.select("min(offset)").collectAsList(); + List max = res.select("max(offset)").collectAsList(); + + Row cr = count.get(0); + Row minr = min.get(0); + Row maxr = max.get(0); + + Row cr2 = count.get(1); + Row minr2 = min.get(1); + Row maxr2 = max.get(1); + + assertEquals("5", cr.getAs(0).toString()); + assertEquals("1", minr.getAs(0).toString()); + assertEquals("5", maxr.getAs(0).toString()); + + assertEquals("5", cr2.getAs(0).toString()); + assertEquals("6", minr2.getAs(0).toString()); + assertEquals("10", maxr2.getAs(0).toString()); + }); + } + + // Check that is AggregatesUsed returns true + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void endToEnd5Test() { + String q = "index = jla02logger | chart count(_raw)"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + boolean aggregates = this.streamingTestUtil.getCatalystVisitor().getAggregatesUsed(); + assertTrue(aggregates); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void 
testSplittingByTime() { + String q = "index=* | chart avg(offset) by _time"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + final StructType expectedSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("avg(offset)", DataTypes.DoubleType, true, new MetadataBuilder().build()), + }); + + assertEquals(expectedSchema, res.schema()); + + List time = res.select("_time").collectAsList(); + List offset = res.select("avg(offset)").collectAsList(); + + System.out.println(time.stream().map(r -> r.getAs(0).toString()).toArray()); + + // assert correct ordering, old to new + String[] expectedTime = new String[] { + "2001-01-01T01:01:01.010+03:00", + "2002-02-02T02:02:02.020+03:00", + "2003-03-03T03:03:03.030+03:00", + "2004-04-04T04:04:04.040+03:00", + "2005-05-05T05:05:05.050+03:00", + "2006-06-06T06:06:06.060+03:00", + "2007-07-07T07:07:07.070+03:00", + "2008-08-08T08:08:08.080+03:00", + "2009-09-09T09:09:09.090+03:00", + "2010-10-10T10:10:10.100+03:00" + }; + String[] expectedOffset = new String[] { + "1.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0", "10.0" + }; + + assertArrayEquals(expectedTime, time.stream().map(r -> r.getAs(0).toString()).toArray()); + assertArrayEquals(expectedOffset, offset.stream().map(r -> r.getAs(0).toString()).toArray()); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void testSplittingByString() { + String q = "index=* | chart avg(offset) by sourcetype"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + final StructType expectedSchema = new StructType(new StructField[] { + new StructField("sourcetype", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("avg(offset)", DataTypes.DoubleType, true, new MetadataBuilder().build()), + }); + + assertEquals(expectedSchema, res.schema()); + + List sourcetype = res.select("sourcetype").collectAsList(); + List offset = res.select("avg(offset)").collectAsList(); + + // ascending ordering for strings + String[] expectedSourcetype = new String[] { + "A:X:0", "A:Y:0", "B:X:0", "B:Y:0" + }; + String[] expectedOffset = new String[] { + "1.5", "4.0", "7.0", "9.5" + }; + + assertArrayEquals(expectedSourcetype, sourcetype.stream().map(r -> r.getAs(0).toString()).toArray()); + assertArrayEquals(expectedOffset, offset.stream().map(r -> r.getAs(0).toString()).toArray()); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void testSplittingByNumber() { + String q = "index=* | chart count(offset) by offset"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + final StructType expectedSchema = new StructType(new StructField[] { + new StructField("offset", DataTypes.LongType, true, new MetadataBuilder().build()), + new StructField("count(offset)", DataTypes.LongType, true, new MetadataBuilder().build()), + }); + + assertEquals(expectedSchema, res.schema()); + + List offset = res.select("offset").collectAsList(); + List count = res.select("count(offset)").collectAsList(); + + // assert correct ascending ordering + String[] expectedOffset = new String[] { + "1", "2", "3", "4", "5", "6", "7", "8", "9", "10" + }; + String[] expectedCount = new String[] { + "1", "1", "1", "1", "1", "1", "1", "1", "1", "1" + }; + + assertArrayEquals(expectedOffset, offset.stream().map(r -> r.getAs(0).toString()).toArray()); + assertArrayEquals(expectedCount, 
count.stream().map(r -> r.getAs(0).toString()).toArray()); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void testSplittingByNumericalString() { + String q = "index=* | eval a = offset + 0 | chart count(offset) by a"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + final StructType expectedSchema = new StructType(new StructField[] { + new StructField("a", DataTypes.StringType, true, new MetadataBuilder().build()), + new StructField("count(offset)", DataTypes.LongType, true, new MetadataBuilder().build()), + }); + + assertEquals(expectedSchema, res.schema()); + + List a = res.select("a").collectAsList(); + List count = res.select("count(offset)").collectAsList(); + + // assert correct ascending ordering + String[] expectedA = new String[] { + "1", "2", "3", "4", "5", "6", "7", "8", "9", "10" + }; + String[] expectedCount = new String[] { + "1", "1", "1", "1", "1", "1", "1", "1", "1", "1" + }; + + assertArrayEquals(expectedA, a.stream().map(r -> r.getAs(0).toString()).toArray()); + assertArrayEquals(expectedCount, count.stream().map(r -> r.getAs(0).toString()).toArray()); + }); + } + + @Disabled(value = "Should be converted to a dataframe test") + @Test + void endToEnd7Test() { + String q; + // First parse incoming DPL + // check that timechart returns also aggregatesUsed=true + q = "index = jla02logger | timechart span=1m count(_raw) by host"; + CharStream inputStream = CharStreams.fromString(q); + DPLLexer lexer = new DPLLexer(inputStream); + DPLParser parser = new DPLParser(new CommonTokenStream(lexer)); + ParseTree tree = parser.root(); + + DPLParserCatalystContext ctx = new DPLParserCatalystContext(spark); + // Use this file for dataset initialization + String testFile = "src/test/resources/xmlWalkerTestData.json"; + Dataset inDs = spark.read().json(testFile); + ctx.setDs(inDs); + ctx.setEarliest("-1Y"); + DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); + + CatalystNode n = (CatalystNode) visitor.visit(tree); + boolean aggregates = visitor.getAggregatesUsed(); + assertTrue(aggregates, visitor.getTraceBuffer().toString()); + } +} diff --git a/src/test/java/com/teragrep/pth10/commandTest.java b/src/test/java/com/teragrep/pth10/commandTest.java index 53dde75..d199296 100644 --- a/src/test/java/com/teragrep/pth10/commandTest.java +++ b/src/test/java/com/teragrep/pth10/commandTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -64,6 +64,7 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class commandTest { + private static final Logger LOGGER = LoggerFactory.getLogger(commandTest.class); // Use this file for dataset initialization @@ -88,12 +89,15 @@ void tearDown() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void explainTest() { String q = "index=index_A sourcetype= A:X:0 | top limit=1 host | fields + host |explain "; this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - StructType expectedSchema = new StructType(new StructField[]{ + StructType expectedSchema = new StructType(new StructField[] { StructField.apply("result", DataTypes.StringType, false, new MetadataBuilder().build()) }); List resAsList = res.collectAsList(); @@ -105,12 +109,15 @@ void explainTest() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void explain1Test() { String q = "index=index_A sourcetype= A:X:0 | top limit=1 host | fields + host |explain extended"; this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - StructType expectedSchema = new StructType(new StructField[]{ + StructType expectedSchema = new StructType(new StructField[] { StructField.apply("result", DataTypes.StringType, false, new MetadataBuilder().build()) }); List resAsList = res.collectAsList(); @@ -123,12 +130,15 @@ void explain1Test() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void explain2Test() { String q = "index = index_A [ search sourcetype= A:X:0 | top limit=3 host | fields + host]|explain extended"; this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - StructType expectedSchema = new StructType(new StructField[]{ + StructType expectedSchema = new StructType(new StructField[] { StructField.apply("result", DataTypes.StringType, false, new MetadataBuilder().build()) }); List resAsList = res.collectAsList(); @@ -141,21 +151,28 @@ void explain2Test() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void auditTest() { String q = "index = index_A [ search sourcetype= A:X:0 | top limit=3 host | fields + host] | explain extended"; this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { DPLAuditInformation ainf = this.streamingTestUtil.getCtx().getAuditInformation(); // Check auditInformation - assertEquals("TestUser",ainf.getUser()); - assertEquals(q,ainf.getQuery()); - assertEquals("Testing audit log",ainf.getReason()); + assertEquals("TestUser", ainf.getUser()); + assertEquals(q, ainf.getQuery()); + assertEquals("Testing audit log", ainf.getReason()); }); } + @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void teragrepTest() { String q = "index=index_A sourcetype= A:X:0 | top limit=1 host | fields + host | teragrep get system version"; @@ -175,9 +192,12 @@ void teragrepTest() { } }); } - + @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void teragrep_Issue149_Test() { String q = " | teragrep get 
system version"; @@ -200,7 +220,10 @@ void teragrep_Issue149_Test() { // TODO: change after pth_03 issue #115 is closed (dpl changed under teragrep command) @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void dplTest() { String q = "index = index_A [ search sourcetype= A:X:0 | top limit=3 host | fields + host]|dpl debug=parsetree subsearch=true"; @@ -210,7 +233,10 @@ void dplTest() { // TODO: change after pth_03 issue #115 is closed (dpl changed under teragrep command) @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void dpl2Test() { String q = "index = index_A [ search sourcetype= A:X:0 | top limit=3 host | fields + host]|dpl debug=parsetree subsearch=false"; @@ -220,7 +246,10 @@ void dpl2Test() { // TODO: change after pth_03 issue #115 is closed (dpl changed under teragrep command) @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void dpl3Test() { String q = "index = index_A [ search sourcetype= A:X:0 | top limit=3 host | fields + host]|dpl debug=parsetree"; @@ -230,7 +259,10 @@ void dpl3Test() { // TODO: change after pth_03 issue #115 is closed (dpl changed under teragrep command) @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void dpl4Test() { String q = "index = index_A [ search sourcetype= A:X:0 | top limit=3 host | fields + host] [ search sourcetype= c:X:0| top limit=1 host | fields + host] |dpl debug=parsetree subsearch=true"; @@ -239,4 +271,3 @@ void dpl4Test() { } } - diff --git a/src/test/java/com/teragrep/pth10/evalTest.java b/src/test/java/com/teragrep/pth10/evalTest.java index 6dcbe9a..e9dadca 100644 --- a/src/test/java/com/teragrep/pth10/evalTest.java +++ b/src/test/java/com/teragrep/pth10/evalTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 diff --git a/src/test/java/com/teragrep/pth10/fieldTransformationTest.java b/src/test/java/com/teragrep/pth10/fieldTransformationTest.java index 2101820..e69872c 100644 --- a/src/test/java/com/teragrep/pth10/fieldTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/fieldTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -62,139 +62,163 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class fieldTransformationTest { - private static final Logger LOGGER = LoggerFactory.getLogger(fieldTransformationTest.class); - - // Use this file for dataset initialization - String testFile = "src/test/resources/xmlWalkerTestDataStreaming"; - private StreamingTestUtil streamingTestUtil; - - @org.junit.jupiter.api.BeforeAll - void setEnv() { - this.streamingTestUtil = new StreamingTestUtil(); - this.streamingTestUtil.setEnv(); - } - - @org.junit.jupiter.api.BeforeEach - void setUp() { - this.streamingTestUtil.setUp(); - } - - @org.junit.jupiter.api.AfterEach - void tearDown() { - this.streamingTestUtil.tearDown(); - } - - @Disabled(value="Should be converted to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void parseFieldsTransformTest() { - String q = "index=cinnamon | fields meta.*"; - String e = "SELECT meta.* FROM ( SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" )"; - String result = Assertions.assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); - utils.printDebug(e,result); - assertEquals(e,result); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void parseFieldsTransformCatTest() { - String q = "index=index_B | fields _time"; - this.streamingTestUtil.performDPLTest(q, this.testFile, ds -> { - List expectedValues = new ArrayList<>(); - expectedValues.add("2006-06-06T06:06:06.060+03:00"); - expectedValues.add("2007-07-07T07:07:07.070+03:00"); - expectedValues.add("2008-08-08T08:08:08.080+03:00"); - expectedValues.add("2009-09-09T09:09:09.090+03:00"); - expectedValues.add("2010-10-10T10:10:10.100+03:00"); - - List dsAsList = ds.collectAsList().stream().map(r -> r.getString(0)).sorted().collect(Collectors.toList()); - Collections.sort(expectedValues); - - assertEquals(5, dsAsList.size()); - for (int i = 0; i < expectedValues.size(); i++) { - assertEquals(expectedValues.get(i), dsAsList.get(i)); - } - - assertEquals("[_time: string]", ds.toString()); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void parseFieldsTransformCat2Test() { - String q = "index=index_B | fields _time host"; - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - assertEquals(5, res.count()); - assertEquals("[_time: string, host: string]", res.toString()); - }); - } - - /* - _raw, _time, host, index, offset, 
partition, source, sourcetype - */ - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void parseFieldsTransformCatDropTest() { - this.streamingTestUtil.performDPLTest("index=index_B | fields - host", this.testFile, res -> { - assertEquals(5, res.count()); - // check that we drop only host-column - String schema = res.schema().toString(); - assertEquals("StructType(StructField(_raw,StringType,true),StructField(_time,StringType,true),StructField(id,LongType,true),StructField(index,StringType,true),StructField(offset,LongType,true),StructField(partition,StringType,true),StructField(source,StringType,true),StructField(sourcetype,StringType,true))", schema); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - void parseFieldsTransformCatDropSeveralTest() { - this.streamingTestUtil.performDPLTest("index=index_B | fields - host index partition", this.testFile, res -> { - assertEquals(5, res.count()); - String schema = res.schema().toString(); - assertEquals("StructType(StructField(_raw,StringType,true),StructField(_time,StringType,true),StructField(id,LongType,true),StructField(offset,LongType,true),StructField(source,StringType,true),StructField(sourcetype,StringType,true))", schema); - }); - } - - @Disabled(value="Should be converteed to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void parseFieldsTransform1Test() { - String q,e,result; - q = "index=cinnamon Denied | fields meta.*,_raw"; - e = "SELECT meta.*,_raw FROM ( SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _raw LIKE '%Denied%' )"; - result = Assertions.assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); - utils.printDebug(e,result); - assertEquals(e,result); - } - - @Disabled(value="Should be converteed to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void parseFieldsTransform2Test() { - String q,e,result; - - q = "index=cinnamon Denied Port | fields meta.*,_raw"; - e = "SELECT meta.*,_raw FROM ( SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _raw LIKE '%Denied%' AND _raw LIKE '%Port%' )"; - result = Assertions.assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); - utils.printDebug(e,result); - assertEquals(e,result); - } - - @Disabled(value="Should be converteed to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void parseFieldsTransformAddTest() { - String q,e,result; - - q = "index=cinnamon Denied Port | fields + meta.*,_raw"; - e = "SELECT meta.*,_raw FROM ( SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _raw LIKE '%Denied%' AND _raw LIKE '%Port%' )"; - result = Assertions.assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); - utils.printDebug(e,result); - assertEquals(e,result); - } - - @Disabled(value="Should be converteed to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void parseFieldsTransformDropTest() { - String q,e,result; - q = "index=cinnamon Denied Port | fields - meta.*, _raw"; - e = "SELECT DROPFIELDS(meta.*,_raw) FROM ( SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _raw LIKE '%Denied%' AND _raw LIKE '%Port%' )"; - result = Assertions.assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); - utils.printDebug(e,result); - assertEquals(e,result); - } + + private static final Logger LOGGER = LoggerFactory.getLogger(fieldTransformationTest.class); + + // Use this file for dataset initialization + 
String testFile = "src/test/resources/xmlWalkerTestDataStreaming"; + private StreamingTestUtil streamingTestUtil; + + @org.junit.jupiter.api.BeforeAll + void setEnv() { + this.streamingTestUtil = new StreamingTestUtil(); + this.streamingTestUtil.setEnv(); + } + + @org.junit.jupiter.api.BeforeEach + void setUp() { + this.streamingTestUtil.setUp(); + } + + @org.junit.jupiter.api.AfterEach + void tearDown() { + this.streamingTestUtil.tearDown(); + } + + @Disabled(value = "Should be converted to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void parseFieldsTransformTest() { + String q = "index=cinnamon | fields meta.*"; + String e = "SELECT meta.* FROM ( SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" )"; + String result = Assertions.assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); + utils.printDebug(e, result); + assertEquals(e, result); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void parseFieldsTransformCatTest() { + String q = "index=index_B | fields _time"; + this.streamingTestUtil.performDPLTest(q, this.testFile, ds -> { + List expectedValues = new ArrayList<>(); + expectedValues.add("2006-06-06T06:06:06.060+03:00"); + expectedValues.add("2007-07-07T07:07:07.070+03:00"); + expectedValues.add("2008-08-08T08:08:08.080+03:00"); + expectedValues.add("2009-09-09T09:09:09.090+03:00"); + expectedValues.add("2010-10-10T10:10:10.100+03:00"); + + List dsAsList = ds + .collectAsList() + .stream() + .map(r -> r.getString(0)) + .sorted() + .collect(Collectors.toList()); + Collections.sort(expectedValues); + + assertEquals(5, dsAsList.size()); + for (int i = 0; i < expectedValues.size(); i++) { + assertEquals(expectedValues.get(i), dsAsList.get(i)); + } + + assertEquals("[_time: string]", ds.toString()); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void parseFieldsTransformCat2Test() { + String q = "index=index_B | fields _time host"; + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + assertEquals(5, res.count()); + assertEquals("[_time: string, host: string]", res.toString()); + }); + } + + /* + _raw, _time, host, index, offset, partition, source, sourcetype + */ + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void parseFieldsTransformCatDropTest() { + this.streamingTestUtil.performDPLTest("index=index_B | fields - host", this.testFile, res -> { + assertEquals(5, res.count()); + // check that we drop only host-column + String schema = res.schema().toString(); + assertEquals( + "StructType(StructField(_raw,StringType,true),StructField(_time,StringType,true),StructField(id,LongType,true),StructField(index,StringType,true),StructField(offset,LongType,true),StructField(partition,StringType,true),StructField(source,StringType,true),StructField(sourcetype,StringType,true))", + schema + ); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + void parseFieldsTransformCatDropSeveralTest() { + this.streamingTestUtil.performDPLTest("index=index_B | fields - host index partition", this.testFile, res -> { + assertEquals(5, res.count()); + String schema = res.schema().toString(); + assertEquals( + "StructType(StructField(_raw,StringType,true),StructField(_time,StringType,true),StructField(id,LongType,true),StructField(offset,LongType,true),StructField(source,StringType,true),StructField(sourcetype,StringType,true))", + schema + ); + }); + 
} + + @Disabled(value = "Should be converteed to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void parseFieldsTransform1Test() { + String q, e, result; + q = "index=cinnamon Denied | fields meta.*,_raw"; + e = "SELECT meta.*,_raw FROM ( SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _raw LIKE '%Denied%' )"; + result = Assertions.assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); + utils.printDebug(e, result); + assertEquals(e, result); + } + + @Disabled(value = "Should be converteed to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void parseFieldsTransform2Test() { + String q, e, result; + + q = "index=cinnamon Denied Port | fields meta.*,_raw"; + e = "SELECT meta.*,_raw FROM ( SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _raw LIKE '%Denied%' AND _raw LIKE '%Port%' )"; + result = Assertions.assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); + utils.printDebug(e, result); + assertEquals(e, result); + } + + @Disabled(value = "Should be converteed to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void parseFieldsTransformAddTest() { + String q, e, result; + + q = "index=cinnamon Denied Port | fields + meta.*,_raw"; + e = "SELECT meta.*,_raw FROM ( SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _raw LIKE '%Denied%' AND _raw LIKE '%Port%' )"; + result = Assertions.assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); + utils.printDebug(e, result); + assertEquals(e, result); + } + + @Disabled(value = "Should be converteed to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void parseFieldsTransformDropTest() { + String q, e, result; + q = "index=cinnamon Denied Port | fields - meta.*, _raw"; + e = "SELECT DROPFIELDS(meta.*,_raw) FROM ( SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _raw LIKE '%Denied%' AND _raw LIKE '%Port%' )"; + result = Assertions.assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); + utils.printDebug(e, result); + assertEquals(e, result); + } } diff --git a/src/test/java/com/teragrep/pth10/indexQueryTest.java b/src/test/java/com/teragrep/pth10/indexQueryTest.java index 25031a7..33cf99b 100644 --- a/src/test/java/com/teragrep/pth10/indexQueryTest.java +++ b/src/test/java/com/teragrep/pth10/indexQueryTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -53,23 +53,23 @@ public class indexQueryTest { - @Disabled(value="Should be converted to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void parseSimpleIndexQueryTest() { - String q,e,result; - q = "index=cinnamon"; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\""; - result = assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); - assertEquals(e,result); - } + @Disabled(value = "Should be converted to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void parseSimpleIndexQueryTest() { + String q, e, result; + q = "index=cinnamon"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\""; + result = assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); + assertEquals(e, result); + } - @Disabled(value="Should be converted to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void parseIndexQueryWithSearchStringTest() { - String q,e,result; - q = "index=kafka_topic conn"; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%conn%'"; - result = assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); - assertEquals(e,result); - } -} \ No newline at end of file + @Disabled(value = "Should be converted to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void parseIndexQueryWithSearchStringTest() { + String q, e, result; + q = "index=kafka_topic conn"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%conn%'"; + result = assertDoesNotThrow(() -> utils.getQueryAnalysis(q)); + assertEquals(e, result); + } +} diff --git a/src/test/java/com/teragrep/pth10/logicalOperationTest.java b/src/test/java/com/teragrep/pth10/logicalOperationTest.java index 1680eee..92b0ec2 100644 --- a/src/test/java/com/teragrep/pth10/logicalOperationTest.java +++ b/src/test/java/com/teragrep/pth10/logicalOperationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -61,392 +61,428 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class logicalOperationTest { - private static final Logger LOGGER = LoggerFactory.getLogger(logicalOperationTest.class); - // Use this file for dataset initialization - String testFile = "src/test/resources/xmlWalkerTestData*.json"; // * to make the path into a directory path - private StreamingTestUtil streamingTestUtil; + private static final Logger LOGGER = LoggerFactory.getLogger(logicalOperationTest.class); - @org.junit.jupiter.api.BeforeAll - void setEnv() { - this.streamingTestUtil = new StreamingTestUtil(); - this.streamingTestUtil.setEnv(); - } + // Use this file for dataset initialization + String testFile = "src/test/resources/xmlWalkerTestData*.json"; // * to make the path into a directory path + private StreamingTestUtil streamingTestUtil; - @org.junit.jupiter.api.BeforeEach - void setUp() { - this.streamingTestUtil.setUp(); - } + @org.junit.jupiter.api.BeforeAll + void setEnv() { + this.streamingTestUtil = new StreamingTestUtil(); + this.streamingTestUtil.setEnv(); + } - @org.junit.jupiter.api.AfterEach - void tearDown() { - this.streamingTestUtil.tearDown(); - } + @org.junit.jupiter.api.BeforeEach + void setUp() { + this.streamingTestUtil.setUp(); + } + + @org.junit.jupiter.api.AfterEach + void tearDown() { + this.streamingTestUtil.tearDown(); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseDPLTest() throws AnalysisException { + String q = "index=kafka_topic conn error eka OR toka kolmas"; + String e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%conn%' AND _raw LIKE '%error%' AND _raw LIKE '%eka%' OR _raw LIKE '%toka%' AND _raw LIKE '%kolmas%'"; + String result = utils.getQueryAnalysis(q); + utils.printDebug(e, result); + assertEquals(e, result); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void parseDPLCatalystTest() { + String q = "index=kafka_topic *conn* *error* *eka* OR *toka* *kolmas*"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + String e = "(RLIKE(index, (?i)^kafka_topic$) AND (((RLIKE(_raw, (?i)^.*\\Qconn\\E.*) AND RLIKE(_raw, (?i)^.*\\Qerror\\E.*)) AND (RLIKE(_raw, (?i)^.*\\Qeka\\E.*) OR RLIKE(_raw, (?i)^.*\\Qtoka\\E.*))) AND RLIKE(_raw, (?i)^.*\\Qkolmas\\E.*)))"; + String result = this.streamingTestUtil.getCtx().getSparkQuery(); + assertEquals(e, result); + }); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseOrTest() throws AnalysisException { + String q = "index=kafka_topic a1 OR a2"; + String e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%a1%' OR _raw LIKE '%a2%'"; + String result = utils.getQueryAnalysis(q); + utils.printDebug(e, result); + assertEquals(e, result); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void parseOrCatalystTest() { + String q = "index=kafka_topic a1 OR a2"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + String e = "(RLIKE(index, (?i)^kafka_topic$) AND (RLIKE(_raw, (?i)^.*\\Qa1\\E.*) OR RLIKE(_raw, (?i)^.*\\Qa2\\E.*)))"; + + String result = this.streamingTestUtil.getCtx().getSparkQuery(); + assertEquals(e, result); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void 
wildcardBloomCheckTest() { + String q = "index=xyz ab*"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + assertTrue(this.streamingTestUtil.getCtx().isWildcardSearchUsed()); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void wildcardBloomCheckTest2() { + String q = "index=xyz ab"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + assertFalse(this.streamingTestUtil.getCtx().isWildcardSearchUsed()); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void parseIndexInCatalystTest() { + String q = "index IN ( index_A index_B )"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + String e = "(RLIKE(index, (?i)^index_a) OR RLIKE(index, (?i)^index_b))"; + + String result = this.streamingTestUtil.getCtx().getSparkQuery(); + assertEquals(e, result); + }); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseAndTest() throws AnalysisException { + String q = "index=kafka_topic a1 AND a2"; + String e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%a1%' AND _raw LIKE '%a2%'"; + String result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void parseAndCatalystTest() { + String q = "index=kafka_topic a1 AND a2"; + + this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { + String e = "(RLIKE(index, (?i)^kafka_topic$) AND (RLIKE(_raw, (?i)^.*\\Qa1\\E.*) AND RLIKE(_raw, (?i)^.*\\Qa2\\E.*)))"; + + String result = this.streamingTestUtil.getCtx().getSparkQuery(); + assertEquals(e, result); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void parseRawUUIDCatalystTest() { + String q = "index=abc sourcetype=\"cd:ef:gh:0\" \"1848c85bfe2c4323955dd5469f18baf6\""; + String testFile = "src/test/resources/uuidTestData*.json"; // * to make the path into a directory path + + this.streamingTestUtil.performDPLTest(q, testFile, res -> { + String e = "(RLIKE(index, (?i)^abc$) AND (RLIKE(sourcetype, (?i)^cd:ef:gh:0) AND RLIKE(_raw, (?i)^.*\\Q1848c85bfe2c4323955dd5469f18baf6\\E.*)))"; + String result = this.streamingTestUtil.getCtx().getSparkQuery(); + assertEquals(e, result); + + // Get raw field and check results. 
Should be 3 matches + Dataset<Row> selected = res.select("_raw"); + //selected.show(false); + List<String> lst = selected + .collectAsList() + .stream() + .map(r -> r.getString(0)) + .sorted() + .collect(Collectors.toList()); + // check result count + assertEquals(3, lst.size()); + // Compare values + assertEquals("uuid=1848c85bfe2c4323955dd5469f18baf6 computer01.example.com", lst.get(1)); + assertEquals("uuid=1848c85bfe2c4323955dd5469f18baf6666 computer01.example.com", lst.get(2)); + assertEquals("uuid=*!<1848c85bFE2c4323955dd5469f18baf6< computer01.example.com", lst.get(0)); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void parseWithQuotesInsideQuotesCatalystTest() { + String q = "index=abc \"\\\"latitude\\\": -89.875, \\\"longitude\\\": 24.125\""; + String testFile = "src/test/resources/latitudeTestData*.json"; // * to make the path into a directory path + + this.streamingTestUtil.performDPLTest(q, testFile, res -> { + String e = "(RLIKE(index, (?i)^abc$) AND RLIKE(_raw, (?i)^.*\\Q\"latitude\": -89.875, \"longitude\": 24.125\\E.*))"; + String result = this.streamingTestUtil.getCtx().getSparkQuery(); + assertEquals(e, result); + + // Get raw field and check results. Should be 2 matches + Dataset<Row> selected = res.select("_raw"); + //selected.show(false); + List<Row> lst = selected.collectAsList(); + // check result count + assertEquals(2, lst.size()); + // Compare values + assertEquals("\"latitude\": -89.875, \"longitude\": 24.125", lst.get(0).getString(0)); + assertEquals("\"latitude\": -89.875, \"longitude\": 24.125", lst.get(1).getString(0)); + }); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseAnd1Test() throws AnalysisException { + String q, e, result; + q = "index=kafka_topic a1 a2"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%a1%' AND _raw LIKE '%a2%'"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseMultipleParenthesisTest() throws AnalysisException { + String q = "index=kafka_topic conn ( ( error AND toka) OR kolmas )"; + String e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%conn%' AND ((_raw LIKE '%error%' AND _raw LIKE '%toka%') OR _raw LIKE '%kolmas%')"; + String result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseParenthesisWithOrTest() throws AnalysisException { + String q, e, result; + q = "index=kafka_topic conn AND ( ( error AND toka ) OR ( kolmas AND n4 ))"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%conn%' AND ((_raw LIKE '%error%' AND _raw LIKE '%toka%') OR (_raw LIKE '%kolmas%' AND _raw LIKE '%n4%'))"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseSimpleParenthesisTest() throws AnalysisException { + String q, e, result; + q = "index=kafka_topic ( conn )"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND (_raw LIKE '%conn%')"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseHostTest() throws AnalysisException { + String q = "index = archive_memory
host = \"localhost\" Deny"; + String e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"archive_memory\" AND host LIKE \"localhost\" AND _raw LIKE '%Deny%'"; + String result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseHost1Test() throws AnalysisException { + String q, e, result; + q = "index = archive_memory ( host = \"localhost\" OR host = \"test\" ) AND sourcetype = \"memory\" Deny"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"archive_memory\" AND (host LIKE \"localhost\" OR host LIKE \"test\") AND sourcetype LIKE \"memory\" AND _raw LIKE '%Deny%'"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseHost2Test() throws AnalysisException { + String q, e, result; + q = "index = archive_memory host = \"localhost\" host = \"test\" host = \"test1\" Deny"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"archive_memory\" AND host LIKE \"localhost\" AND host LIKE \"test\" AND host LIKE \"test1\" AND _raw LIKE '%Deny%'"; + result = utils.getQueryAnalysis(q); + utils.printDebug(e, result); + assertEquals(e, result); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseHost3Test() throws AnalysisException { + String q, e, result; + // missing AND in query + q = "index = archive_memory host = \"localhost\" host = \"test\" Deny"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"archive_memory\" AND host LIKE \"localhost\" AND host LIKE \"test\" AND _raw LIKE '%Deny%'"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseHost4Test() throws AnalysisException { + String q, e, result; + q = "index = archive_memory host = \"localhost\" host = \"test\" host = \"test1\" Deny"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"archive_memory\" AND host LIKE \"localhost\" AND host LIKE \"test\" AND host LIKE \"test1\" AND _raw LIKE '%Deny%'"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseHost5Test() throws AnalysisException { + String q, e, result; + // Same but missing AND in query + q = "index = archive_memory host = \"one\" host = \"two\" host = \"tree\" number"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"archive_memory\" AND host LIKE \"one\" AND host LIKE \"two\" AND host LIKE \"tree\" AND _raw LIKE '%number%'"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } + + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void streamListWithoutQuotesTest() throws AnalysisException { + String q, e, result; + q = "index = memory-test latest=\"05/10/2022:09:11:40\" host= sc-99-99-14-25 sourcetype= log:f17:0 Latitude"; + long latestEpoch = new DefaultTimeFormat().getEpoch("05/10/2022:09:11:40"); + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"memory-test\" AND _time <= from_unixtime(" + + latestEpoch + + ") AND host LIKE \"sc-99-99-14-25\" AND sourcetype LIKE \"log:f17:0\" AND _raw LIKE '%Latitude%'"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseDPLTest() throws 
AnalysisException { - String q = "index=kafka_topic conn error eka OR toka kolmas"; - String e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%conn%' AND _raw LIKE '%error%' AND _raw LIKE '%eka%' OR _raw LIKE '%toka%' AND _raw LIKE '%kolmas%'"; - String result = utils.getQueryAnalysis(q); - utils.printDebug(e,result); - assertEquals(e,result); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void parseDPLCatalystTest() { - String q = "index=kafka_topic *conn* *error* *eka* OR *toka* *kolmas*"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - String e = "(RLIKE(index, (?i)^kafka_topic$) AND (((RLIKE(_raw, (?i)^.*\\Qconn\\E.*) AND RLIKE(_raw, (?i)^.*\\Qerror\\E.*)) AND (RLIKE(_raw, (?i)^.*\\Qeka\\E.*) OR RLIKE(_raw, (?i)^.*\\Qtoka\\E.*))) AND RLIKE(_raw, (?i)^.*\\Qkolmas\\E.*)))"; - String result = this.streamingTestUtil.getCtx().getSparkQuery(); - assertEquals(e, result); - }); - } - - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseOrTest() throws AnalysisException { - String q = "index=kafka_topic a1 OR a2"; - String e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%a1%' OR _raw LIKE '%a2%'"; - String result = utils.getQueryAnalysis(q); - utils.printDebug(e,result); - assertEquals(e,result); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void parseOrCatalystTest() { - String q = "index=kafka_topic a1 OR a2"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - String e = "(RLIKE(index, (?i)^kafka_topic$) AND (RLIKE(_raw, (?i)^.*\\Qa1\\E.*) OR RLIKE(_raw, (?i)^.*\\Qa2\\E.*)))"; - - String result = this.streamingTestUtil.getCtx().getSparkQuery(); - assertEquals(e, result); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void wildcardBloomCheckTest() { - String q = "index=xyz ab*"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - assertTrue(this.streamingTestUtil.getCtx().isWildcardSearchUsed()); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void wildcardBloomCheckTest2() { - String q = "index=xyz ab"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - assertFalse(this.streamingTestUtil.getCtx().isWildcardSearchUsed()); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void parseIndexInCatalystTest() { - String q = "index IN ( index_A index_B )"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - String e = "(RLIKE(index, (?i)^index_a) OR RLIKE(index, (?i)^index_b))"; - - String result = this.streamingTestUtil.getCtx().getSparkQuery(); - assertEquals(e, result); - }); - } - - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseAndTest() throws AnalysisException { - String q = "index=kafka_topic a1 AND a2"; - String e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%a1%' AND _raw LIKE '%a2%'"; - String result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void parseAndCatalystTest() { - String q = "index=kafka_topic a1 AND a2"; - - this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { - String e = "(RLIKE(index, (?i)^kafka_topic$) AND (RLIKE(_raw, (?i)^.*\\Qa1\\E.*) 
AND RLIKE(_raw, (?i)^.*\\Qa2\\E.*)))"; - - String result = this.streamingTestUtil.getCtx().getSparkQuery(); - assertEquals(e, result); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void parseRawUUIDCatalystTest() { - String q = "index=abc sourcetype=\"cd:ef:gh:0\" \"1848c85bfe2c4323955dd5469f18baf6\""; - String testFile = "src/test/resources/uuidTestData*.json"; // * to make the path into a directory path - - this.streamingTestUtil.performDPLTest(q, testFile, res -> { - String e = "(RLIKE(index, (?i)^abc$) AND (RLIKE(sourcetype, (?i)^cd:ef:gh:0) AND RLIKE(_raw, (?i)^.*\\Q1848c85bfe2c4323955dd5469f18baf6\\E.*)))"; - String result = this.streamingTestUtil.getCtx().getSparkQuery(); - assertEquals(e, result); - - // Get raw field and check results. Should be only 1 match - Dataset selected = res.select("_raw"); - //selected.show(false); - List lst = selected.collectAsList().stream().map(r->r.getString(0)).sorted().collect(Collectors.toList()); - // check result count - assertEquals(3,lst.size()); - // Compare values - assertEquals("uuid=1848c85bfe2c4323955dd5469f18baf6 computer01.example.com",lst.get(1)); - assertEquals("uuid=1848c85bfe2c4323955dd5469f18baf6666 computer01.example.com",lst.get(2)); - assertEquals("uuid=*!<1848c85bFE2c4323955dd5469f18baf6< computer01.example.com",lst.get(0)); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void parseWithQuotesInsideQuotesCatalystTest() { - String q = "index=abc \"\\\"latitude\\\": -89.875, \\\"longitude\\\": 24.125\""; - String testFile = "src/test/resources/latitudeTestData*.json"; // * to make the path into a directory path - - this.streamingTestUtil.performDPLTest(q, testFile, res -> { - String e = "(RLIKE(index, (?i)^abc$) AND RLIKE(_raw, (?i)^.*\\Q\"latitude\": -89.875, \"longitude\": 24.125\\E.*))"; - String result = this.streamingTestUtil.getCtx().getSparkQuery(); - assertEquals(e, result); - - // Get raw field and check results. 
Should be only 1 match - Dataset selected = res.select("_raw"); - //selected.show(false); - List lst = selected.collectAsList(); - // check result count - assertEquals(2, lst.size()); - // Compare values - assertEquals("\"latitude\": -89.875, \"longitude\": 24.125", lst.get(0).getString(0)); - assertEquals("\"latitude\": -89.875, \"longitude\": 24.125", lst.get(1).getString(0)); - }); - } - - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseAnd1Test() throws AnalysisException { - String q,e,result; - q = "index=kafka_topic a1 a2"; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%a1%' AND _raw LIKE '%a2%'"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseMultipleParenthesisTest() throws AnalysisException { - String q = "index=kafka_topic conn ( ( error AND toka) OR kolmas )"; - String e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%conn%' AND ((_raw LIKE '%error%' AND _raw LIKE '%toka%') OR _raw LIKE '%kolmas%')"; - String result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseParenthesisWithOrTest() throws AnalysisException { - String q,e,result; - q = "index=kafka_topic conn AND ( ( error AND toka ) OR ( kolmas AND n4 ))"; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND _raw LIKE '%conn%' AND ((_raw LIKE '%error%' AND _raw LIKE '%toka%') OR (_raw LIKE '%kolmas%' AND _raw LIKE '%n4%'))"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseSimpleParenthesisTest() throws AnalysisException { - String q,e,result; - q = "index=kafka_topic ( conn )"; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"kafka_topic\" AND (_raw LIKE '%conn%')"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseHostTest() throws AnalysisException { - String q = "index = archive_memory host = \"localhost\" Deny"; - String e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"archive_memory\" AND host LIKE \"localhost\" AND _raw LIKE '%Deny%'"; - String result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseHost1Test() throws AnalysisException { - String q,e,result; - q = "index = archive_memory ( host = \"localhost\" OR host = \"test\" ) AND sourcetype = \"memory\" Deny"; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"archive_memory\" AND (host LIKE \"localhost\" OR host LIKE \"test\") AND sourcetype LIKE \"memory\" AND _raw LIKE '%Deny%'"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void streamListWithoutQuotes1Test() throws AnalysisException { + String q, e, result; + // Test to_lower() for inex,host,sourcetyype + q = "index = MEMORY-test latest=\"05/10/2022:09:11:40\" host= SC-99-99-14-20 sourcetype= LOG:F17:0 Latitude"; + long latestEpoch2 = new DefaultTimeFormat().getEpoch("05/10/2022:09:11:40"); + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"memory-test\" AND _time <= 
from_unixtime(" + + latestEpoch2 + + ") AND host LIKE \"sc-99-99-14-20\" AND sourcetype LIKE \"log:f17:0\" AND _raw LIKE '%Latitude%'"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseHost2Test() throws AnalysisException { - String q,e,result; - q = "index = archive_memory host = \"localhost\" host = \"test\" host = \"test1\" Deny"; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"archive_memory\" AND host LIKE \"localhost\" AND host LIKE \"test\" AND host LIKE \"test1\" AND _raw LIKE '%Deny%'"; - result = utils.getQueryAnalysis(q); - utils.printDebug(e,result); - assertEquals(e,result); - } + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void logicalNotTest() throws AnalysisException { + String q, e, result; + // Test to_lower() for inex,host,sourcetyype + q = "index=f17 sourcetype=log:f17:0 _index_earliest=\"12/31/1970:10:15:30\" _index_latest=\"12/31/2022:10:15:30\" NOT rainfall_rate"; + long earliestEpoch = new DefaultTimeFormat().getEpoch("12/31/1970:10:15:30"); + long latestEpoch = new DefaultTimeFormat().getEpoch("12/31/2022:10:15:30"); + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"f17\" AND sourcetype LIKE \"log:f17:0\" AND _time >= from_unixtime(" + + earliestEpoch + ") AND _time <= from_unixtime(" + latestEpoch + + ") AND NOT _raw LIKE '%rainfall_rate%'"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseHost3Test() throws AnalysisException { - String q,e,result; - // missing AND in query - q = "index = archive_memory host = \"localhost\" host = \"test\" Deny"; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"archive_memory\" AND host LIKE \"localhost\" AND host LIKE \"test\" AND _raw LIKE '%Deny%'"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void logicalNot1Test() throws AnalysisException { + String q, e, result; + // Test to_lower() for inex,host,sourcetyype + q = "index=cpu sourcetype=log:cpu:0 NOT src"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cpu\" AND sourcetype LIKE \"log:cpu:0\" AND NOT _raw LIKE '%src%'"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseHost4Test() throws AnalysisException { - String q,e,result; - q = "index = archive_memory host = \"localhost\" host = \"test\" host = \"test1\" Deny"; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"archive_memory\" AND host LIKE \"localhost\" AND host LIKE \"test\" AND host LIKE \"test1\" AND _raw LIKE '%Deny%'"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void logicalQuotedCompoundTest() throws AnalysisException { + String q, e, result; + // Test to_lower() for inex,host,sourcetyype + q = "index=f17 \"ei yhdys sana\""; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"f17\" AND _raw LIKE '%ei yhdys sana%'"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseHost5Test() throws AnalysisException { - String q,e,result; - // Same but missing AND in query - q = "index = archive_memory host = 
\"one\" host = \"two\" host = \"tree\" number"; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"archive_memory\" AND host LIKE \"one\" AND host LIKE \"two\" AND host LIKE \"tree\" AND _raw LIKE '%number%'"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void logicalUnQuotedCompoundTest() throws AnalysisException { + String q, e, result; + // Test to_lower() for inex,host,sourcetyype + q = "index=f17 ei yhdys sana"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"f17\" AND _raw LIKE '%ei%' AND _raw LIKE '%yhdys%' AND _raw LIKE '%sana%'"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void streamListWithoutQuotesTest() throws AnalysisException { - String q,e,result; - q = "index = memory-test latest=\"05/10/2022:09:11:40\" host= sc-99-99-14-25 sourcetype= log:f17:0 Latitude"; - long latestEpoch = new DefaultTimeFormat().getEpoch("05/10/2022:09:11:40"); - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"memory-test\" AND _time <= from_unixtime("+latestEpoch+") AND host LIKE \"sc-99-99-14-25\" AND sourcetype LIKE \"log:f17:0\" AND _raw LIKE '%Latitude%'"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void logicalUnQuotedCompound2Test() throws AnalysisException { + String q, e, result; + // Test to_lower() for inex,host,sourcetyype + q = "index=f17 ei AND yhdys sana"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"f17\" AND _raw LIKE '%ei%' AND _raw LIKE '%yhdys%' AND _raw LIKE '%sana%'"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void streamListWithoutQuotes1Test() throws AnalysisException { - String q,e,result; - // Test to_lower() for inex,host,sourcetyype - q = "index = MEMORY-test latest=\"05/10/2022:09:11:40\" host= SC-99-99-14-20 sourcetype= LOG:F17:0 Latitude"; - long latestEpoch2 = new DefaultTimeFormat().getEpoch("05/10/2022:09:11:40"); - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"memory-test\" AND _time <= from_unixtime("+latestEpoch2+") AND host LIKE \"sc-99-99-14-20\" AND sourcetype LIKE \"log:f17:0\" AND _raw LIKE '%Latitude%'"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void logicalNotTest() throws AnalysisException { - String q,e,result; - // Test to_lower() for inex,host,sourcetyype - q = "index=f17 sourcetype=log:f17:0 _index_earliest=\"12/31/1970:10:15:30\" _index_latest=\"12/31/2022:10:15:30\" NOT rainfall_rate"; - long earliestEpoch = new DefaultTimeFormat().getEpoch("12/31/1970:10:15:30"); - long latestEpoch = new DefaultTimeFormat().getEpoch("12/31/2022:10:15:30"); - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"f17\" AND sourcetype LIKE \"log:f17:0\" AND _time >= from_unixtime("+earliestEpoch+") AND _time <= from_unixtime("+latestEpoch+") AND NOT _raw LIKE '%rainfall_rate%'"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - - - - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void logicalNot1Test() throws AnalysisException { - String q,e,result; - // Test to_lower() for inex,host,sourcetyype - q = "index=cpu sourcetype=log:cpu:0 NOT src"; 
- e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"cpu\" AND sourcetype LIKE \"log:cpu:0\" AND NOT _raw LIKE '%src%'"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - - - - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void logicalQuotedCompoundTest() throws AnalysisException { - String q,e,result; - // Test to_lower() for inex,host,sourcetyype - q = "index=f17 \"ei yhdys sana\""; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"f17\" AND _raw LIKE '%ei yhdys sana%'"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void logicalUnQuotedCompoundTest() throws AnalysisException { - String q,e,result; - // Test to_lower() for inex,host,sourcetyype - q = "index=f17 ei yhdys sana"; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"f17\" AND _raw LIKE '%ei%' AND _raw LIKE '%yhdys%' AND _raw LIKE '%sana%'"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void logicalUnQuotedCompound2Test() throws AnalysisException { - String q,e,result; - // Test to_lower() for inex,host,sourcetyype - q = "index=f17 ei AND yhdys sana"; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"f17\" AND _raw LIKE '%ei%' AND _raw LIKE '%yhdys%' AND _raw LIKE '%sana%'"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } - - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void logicalQuotedIntTest() throws AnalysisException { - String q,e,result; - // Test to_lower() for inex,host,sourcetyype - q = "index=f17 \"1.2\""; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"f17\" AND _raw LIKE '%1.2%'"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result); - } + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void logicalQuotedIntTest() throws AnalysisException { + String q, e, result; + // Test to_lower() for inex,host,sourcetyype + q = "index=f17 \"1.2\""; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"f17\" AND _raw LIKE '%1.2%'"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result); + } @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void logicalUnQuotedIntTest() throws AnalysisException { - String q,e,result; - // Test to_lower() for inex,host,sourcetyype - q = "index=f17 1.2"; - e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"f17\" AND _raw LIKE '%1.2%'"; - result = utils.getQueryAnalysis(q); - utils.printDebug(e,result); - assertEquals(e,result); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void parseLikeWithParenthesisCatalystTest() { - String q = "index=access_log earliest=\"01/21/2022:10:00:00\" latest=\"01/21/2022:11:59:59\" \"*(3)www(7)example(3)com(0)*\" OR \"*(4)mail(7)example(3)com(0)*\""; - - this.streamingTestUtil.performDPLTest(q, testFile, res -> { - String e = "(RLIKE(index, (?i)^access_log$) AND (((_time >= from_unixtime(1642752000, yyyy-MM-dd HH:mm:ss)) AND (_time < from_unixtime(1642759199, yyyy-MM-dd HH:mm:ss))) AND (RLIKE(_raw, (?i)^.*\\Q(3)www(7)example(3)com(0)\\E.*) OR RLIKE(_raw, (?i)^.*\\Q(4)mail(7)example(3)com(0)\\E.*))))"; - - String result = this.streamingTestUtil.getCtx().getSparkQuery(); - assertEquals(e, result); - }); - } + @Test // disabled on 2022-05-16 TODO Convert to dataframe 
test + public void logicalUnQuotedIntTest() throws AnalysisException { + String q, e, result; + // Test to_lower() for inex,host,sourcetyype + q = "index=f17 1.2"; + e = "SELECT * FROM `temporaryDPLView` WHERE index LIKE \"f17\" AND _raw LIKE '%1.2%'"; + result = utils.getQueryAnalysis(q); + utils.printDebug(e, result); + assertEquals(e, result); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void parseLikeWithParenthesisCatalystTest() { + String q = "index=access_log earliest=\"01/21/2022:10:00:00\" latest=\"01/21/2022:11:59:59\" \"*(3)www(7)example(3)com(0)*\" OR \"*(4)mail(7)example(3)com(0)*\""; + + this.streamingTestUtil.performDPLTest(q, testFile, res -> { + String e = "(RLIKE(index, (?i)^access_log$) AND (((_time >= from_unixtime(1642752000, yyyy-MM-dd HH:mm:ss)) AND (_time < from_unixtime(1642759199, yyyy-MM-dd HH:mm:ss))) AND (RLIKE(_raw, (?i)^.*\\Q(3)www(7)example(3)com(0)\\E.*) OR RLIKE(_raw, (?i)^.*\\Q(4)mail(7)example(3)com(0)\\E.*))))"; + + String result = this.streamingTestUtil.getCtx().getSparkQuery(); + assertEquals(e, result); + }); + } } diff --git a/src/test/java/com/teragrep/pth10/relativeTimeTest.java b/src/test/java/com/teragrep/pth10/relativeTimeTest.java index db1128a..3877f84 100644 --- a/src/test/java/com/teragrep/pth10/relativeTimeTest.java +++ b/src/test/java/com/teragrep/pth10/relativeTimeTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -63,28 +63,29 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class relativeTimeTest { + private static final Logger LOGGER = LoggerFactory.getLogger(relativeTimeTest.class); - private TimeZone originalTimeZone = null; + private TimeZone originalTimeZone = null; // use this file to initialize the streaming dataset String testFile = "src/test/resources/xmlWalkerTestDataStreaming"; private StreamingTestUtil streamingTestUtil; - @BeforeAll - void setEnv() { + @BeforeAll + void setEnv() { // set default timezone - originalTimeZone = TimeZone.getDefault(); - TimeZone.setDefault(TimeZone.getTimeZone("Europe/Helsinki")); + originalTimeZone = TimeZone.getDefault(); + TimeZone.setDefault(TimeZone.getTimeZone("Europe/Helsinki")); this.streamingTestUtil = new StreamingTestUtil(); this.streamingTestUtil.setEnv(); - } - - @BeforeEach - void setUp() { - TimeZone.setDefault(TimeZone.getTimeZone("Europe/Helsinki")); + } + + @BeforeEach + void setUp() { + TimeZone.setDefault(TimeZone.getTimeZone("Europe/Helsinki")); this.streamingTestUtil.setUp(); - } + } @AfterEach void tearDown() { @@ -93,11 +94,14 @@ void tearDown() { @AfterAll void recoverTimeZone() { - TimeZone.setDefault(originalTimeZone); - } + TimeZone.setDefault(originalTimeZone); + } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimeformatTest() { // unix epoch format String q = "index=kafka_topic timeformat=%s earliest=1587032680 latest=1587021942"; @@ -110,8 +114,11 @@ public void parseTimeformatTest() { }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimeformat_example_default_format_Test() { // default but given manually String q = "index=kafka_topic timeformat=%m/%d/%Y:%H:%M:%S earliest=\"04/16/2020:10:24:40\" latest=\"04/16/2020:10:25:42\""; @@ -120,14 +127,17 @@ public void parseTimeformat_example_default_format_Test() { long latestEpoch = new DefaultTimeFormat().getEpoch("04/16/2020:10:25:42"); String regex = "^.*_time >= from_unixtime\\(1587021880.*_time < from_unixtime\\(" + latestEpoch + ".*$"; - LOGGER.info("Complex timeformat<{}>",q); + LOGGER.info("Complex timeformat<{}>", q); String result = this.streamingTestUtil.getCtx().getSparkQuery(); assertTrue(result.matches(regex)); }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimeformat_custom_format1_Test() { // custom format SS-MM-HH YY-DD-MM String q = "index=kafka_topic timeformat=\"%S-%M-%H %Y-%d-%m\" earliest=\"40-24-10 2020-16-04\" latest=\"42-25-10 2020-16-04\""; @@ -141,8 +151,11 @@ public void parseTimeformat_custom_format1_Test() { }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimeformat_custom_format2_Test() { // earliest custom format ISO8601 + HH:MM:SS , latest default String q = "index=kafka_topic timeformat=\"%F %T\" earliest=\"2020-04-16 10:24:40\" latest=\"2020-04-16 10:25:42\""; @@ -151,14 +164,17 @@ public void parseTimeformat_custom_format2_Test() { long latestEpoch = new DefaultTimeFormat().getEpoch("04/16/2020:10:25:42"); String regex = 
"^.*_time >= from_unixtime\\(1587021880.*_time < from_unixtime\\(" + latestEpoch + ".*$"; - LOGGER.info("Complex timeformat<{}>",q); + LOGGER.info("Complex timeformat<{}>", q); String result = this.streamingTestUtil.getCtx().getSparkQuery(); assertTrue(result.matches(regex)); }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimeformat_custom_format3_Test() { // earliest custom format 16 Apr 2020 10.24.40 AM (dd MMM y hh.mm.ss a) , latest default String q = "index=kafka_topic timeformat=\"%d %b %Y %I.%M.%S %p\" earliest=\"16 Apr 2020 10.24.40 AM\" latest=\"16 Apr 2020 10.25.42 AM\""; @@ -173,8 +189,11 @@ public void parseTimeformat_custom_format3_Test() { } @Disabled(value = "starttimeu is not implemented") - @Test // disabled on 2022-05-16 TODO implement - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test // disabled on 2022-05-16 TODO implement + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseStarttimeuTest() { String q = "index=cinnamon starttimeu=1587032680"; @@ -187,7 +206,10 @@ public void parseStarttimeuTest() { @Disabled(value = "endtimeu is not implemented") @Test // disabled on 2022-05-16 TODO implement - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseEndtimeuTest() { String q = "index=cinnamon endtimeu=1587032680"; @@ -199,8 +221,11 @@ public void parseEndtimeuTest() { } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampEarliestRelativeTest() { Timestamp timestamp = new Timestamp(System.currentTimeMillis()); Instant t1 = timestamp.toInstant(); @@ -212,23 +237,27 @@ public void parseTimestampEarliestRelativeTest() { LocalDateTime etime = LocalDateTime.ofInstant(exp, ZoneOffset.UTC); RelativeTimestamp rtTimestamp = rtParser.parse("-1h"); long rtEpoch = rtTimestamp.calculate(timestamp); - assertEquals(etime.getHour(), LocalDateTime.ofInstant(Instant.ofEpochSecond(rtEpoch), ZoneOffset.UTC).getHour()); + assertEquals( + etime.getHour(), LocalDateTime.ofInstant(Instant.ofEpochSecond(rtEpoch), ZoneOffset.UTC).getHour() + ); // -3 min exp = t1.plus(-3, ChronoUnit.MINUTES); etime = LocalDateTime.ofInstant(exp, ZoneOffset.UTC); rtTimestamp = rtParser.parse("-3m"); rtEpoch = rtTimestamp.calculate(timestamp); - assertEquals(etime.getMinute(), - LocalDateTime.ofInstant(Instant.ofEpochSecond(rtEpoch), ZoneOffset.UTC).getMinute()); + assertEquals( + etime.getMinute(), LocalDateTime.ofInstant(Instant.ofEpochSecond(rtEpoch), ZoneOffset.UTC).getMinute() + ); // Using localDateTime-method // -1 week LocalDateTime dt = timestamp.toLocalDateTime(); LocalDateTime et = dt.minusWeeks(1); rtTimestamp = rtParser.parse("-1w"); rtEpoch = rtTimestamp.calculate(timestamp); - assertEquals(et.getDayOfWeek(), - LocalDateTime.ofInstant(Instant.ofEpochSecond(rtEpoch), ZoneOffset.UTC).getDayOfWeek()); + assertEquals( + et.getDayOfWeek(), LocalDateTime.ofInstant(Instant.ofEpochSecond(rtEpoch), ZoneOffset.UTC).getDayOfWeek() + ); // -3 month dt = timestamp.toLocalDateTime(); et = dt.minusMonths(3); @@ -243,30 +272,37 @@ public void parseTimestampEarliestRelativeTest() { rtEpoch = rtTimestamp.calculate(timestamp); assertEquals(et.getYear(), 
LocalDateTime.ofInstant(Instant.ofEpochSecond(rtEpoch), ZoneOffset.UTC).getYear()); } - + // test snap-to-time "@d" - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampSnapToTimeRelativeTest() { - // Test for snap-to-time functionality - // for example "-@d" would snap back to midnight of set day - long epochSeconds = 1643881600; // Thursday, February 3, 2022 09:46:40 UTC - Timestamp timestamp = new Timestamp(epochSeconds*1000L); + // Test for snap-to-time functionality + // for example "-@d" would snap back to midnight of set day + long epochSeconds = 1643881600; // Thursday, February 3, 2022 09:46:40 UTC + Timestamp timestamp = new Timestamp(epochSeconds * 1000L); RelativeTimeParser rtParser = new RelativeTimeParser(); - - LocalDateTime dt = timestamp.toLocalDateTime(); + + LocalDateTime dt = timestamp.toLocalDateTime(); LocalDateTime et = dt.minusHours(9); et = et.minusMinutes(46); et = et.minusSeconds(40); // Thu Feb 3, 2022 00:00 UTC RelativeTimestamp rtTimestamp = rtParser.parse("@d"); long rtEpoch = rtTimestamp.calculate(timestamp); - assertEquals(et.getDayOfWeek(), - LocalDateTime.ofInstant(Instant.ofEpochSecond(rtEpoch), ZoneOffset.systemDefault()).getDayOfWeek()); + assertEquals( + et.getDayOfWeek(), LocalDateTime.ofInstant(Instant.ofEpochSecond(rtEpoch), ZoneOffset.systemDefault()).getDayOfWeek() + ); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampLatestRelativeTest() { String q = "index=cinnamon latest=-3h "; @@ -282,7 +318,10 @@ public void parseTimestampLatestRelativeTest() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampLatestRelativeTestWithPlus() { String q = "index=cinnamon latest=+3h "; @@ -298,19 +337,26 @@ public void parseTimestampLatestRelativeTestWithPlus() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampLatestRelativeTestWithoutSign() { String q = "index=cinnamon latest=3h "; String expected = "TimeQualifier conversion error: <3h> can't be parsed."; - RuntimeException exception = - this.streamingTestUtil.performThrowingDPLTest(RuntimeException.class, q, this.testFile, res -> {}); + RuntimeException exception = this.streamingTestUtil + .performThrowingDPLTest(RuntimeException.class, q, this.testFile, res -> { + }); assertEquals(expected, exception.getMessage()); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampLatestRelativeSnapTest() { String q = "index=cinnamon latest=@d "; @@ -326,8 +372,11 @@ public void parseTimestampLatestRelativeSnapTest() { }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampLatestRelativeSnapWithOffsetTest() { String q = "index=cinnamon latest=@d+3h "; @@ -344,8 +393,11 @@ public void parseTimestampLatestRelativeSnapWithOffsetTest() { }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + 
@DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampLatestRelativeNowTest() { String q = "index=cinnamon latest=now "; @@ -360,10 +412,13 @@ public void parseTimestampLatestRelativeNowTest() { }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampRelativeTicket24QueryTest() { - // pth10 ticket #24 query: 'index=... sourcetype=... earliest=@d latest=now' + // pth10 ticket #24 query: 'index=... sourcetype=... earliest=@d latest=now' String q = "index=cinnamon earliest=@d latest=now"; this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { @@ -372,42 +427,55 @@ public void parseTimestampRelativeTicket24QueryTest() { long expectedEarliest = now.truncatedTo(ChronoUnit.DAYS).toInstant().getEpochSecond(); long expectedLatest = now.toEpochSecond(); String expectedLatestString = String.valueOf(expectedLatest).substring(0, 7); // don't check last 2 numbers as the query takes some time and the "now" is different - String regex = "^.*_time >= from_unixtime\\(" + expectedEarliest + ".*_time < from_unixtime\\(" + expectedLatestString + ".*$";; + String regex = "^.*_time >= from_unixtime\\(" + expectedEarliest + ".*_time < from_unixtime\\(" + + expectedLatestString + ".*$"; + ; String result = this.streamingTestUtil.getCtx().getSparkQuery(); assertTrue(result.matches(regex)); }); } - + // should throw an exception - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampRelativeTicket66QueryTest() { - // pth10 ticket #66 query: 'index=... sourcetype=... earliest=@-5h latest=@-3h' + // pth10 ticket #66 query: 'index=... sourcetype=... 
earliest=@-5h latest=@-3h' String query = "index=cinnamon earliest=\"@-5h\" latest=\"@-3h\""; String expected = "TimeQualifier conversion error: <@-5h> can't be parsed."; - RuntimeException exception = - this.streamingTestUtil.performThrowingDPLTest(RuntimeException.class, query, this.testFile, res -> {}); + RuntimeException exception = this.streamingTestUtil + .performThrowingDPLTest(RuntimeException.class, query, this.testFile, res -> { + }); assertEquals(expected, exception.getMessage()); } - + // should throw an exception - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampRelativeInvalidTimeUnitQueryTest() { String q = "index=cinnamon earliest=-5x latest=-7z"; String e = "Relative timestamp contained an invalid time unit"; - - Throwable exception = - this.streamingTestUtil.performThrowingDPLTest(RuntimeException.class, q, this.testFile, res -> {}); - + + Throwable exception = this.streamingTestUtil + .performThrowingDPLTest(RuntimeException.class, q, this.testFile, res -> { + }); + assertEquals(e, exception.getMessage()); } - + // test with quotes - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampRelativeWithQuotesTest() { String q = "index=cinnamon earliest=\"-3h@h\" latest=\"-1h@h\""; @@ -416,15 +484,19 @@ public void parseTimestampRelativeWithQuotesTest() { Instant now = timestamp.toInstant(); Instant earliest = now.minus(3L, ChronoUnit.HOURS).truncatedTo(ChronoUnit.HOURS); Instant latest = now.minus(1L, ChronoUnit.HOURS).truncatedTo(ChronoUnit.HOURS); - String regex = "^.*_time >= from_unixtime\\(" + earliest.getEpochSecond() + ".*_time < from_unixtime\\(" + latest.getEpochSecond() + ".*$"; + String regex = "^.*_time >= from_unixtime\\(" + earliest.getEpochSecond() + ".*_time < from_unixtime\\(" + + latest.getEpochSecond() + ".*$"; String result = this.streamingTestUtil.getCtx().getSparkQuery(); assertTrue(result.matches(regex)); }); } - + // test with -h - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampRelativeWithoutExplicitAmountOfTimeTest() { String q = "index=cinnamon earliest=\"-h\""; @@ -438,10 +510,13 @@ public void parseTimestampRelativeWithoutExplicitAmountOfTimeTest() { assertTrue(result.matches(regex)); }); } - + // test with relative timestamp, snap to time and offset - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampRelativeComplexTest() { String q = "index=cinnamon earliest=\"-3h@d+1d\""; @@ -456,9 +531,12 @@ public void parseTimestampRelativeComplexTest() { assertTrue(result.matches(regex)); }); } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampEarliestTest() { String q; // earliest @@ -472,8 +550,11 @@ public void parseTimestampEarliestTest() { }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampLatestTest() { String q; // latest @@ -487,15 
+568,19 @@ public void parseTimestampLatestTest() { }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampEarliestLatestTest() { String q; // earliest, latest q = "index=cinnamon earliest=\"04/16/2020:10:25:40\" latest=\"04/16/2020:10:25:42\""; long earliestEpoch2 = new DefaultTimeFormat().getEpoch("04/16/2020:10:25:40"); long latestEpoch2 = new DefaultTimeFormat().getEpoch("04/16/2020:10:25:42"); - String regex = "^.*_time >= from_unixtime\\(" + earliestEpoch2 + ".*_time < from_unixtime\\(" + latestEpoch2 + ".*$"; + String regex = "^.*_time >= from_unixtime\\(" + earliestEpoch2 + ".*_time < from_unixtime\\(" + latestEpoch2 + + ".*$"; this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { String result = this.streamingTestUtil.getCtx().getSparkQuery(); @@ -503,14 +588,18 @@ public void parseTimestampEarliestLatestTest() { }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void parseTimestampIndexEarliestLatestTest() { // _index_earliest, _index_latest String q = "index=cinnamon _index_earliest=\"04/16/2020:10:25:40\" _index_latest=\"04/16/2020:10:25:42\""; long indexEarliestEpoch = new DefaultTimeFormat().getEpoch("04/16/2020:10:25:40"); long indexLatestEpoch = new DefaultTimeFormat().getEpoch("04/16/2020:10:25:42"); - String regex = "^.*_time >= from_unixtime\\(" + indexEarliestEpoch + ".*_time < from_unixtime\\(" + indexLatestEpoch + ".*$"; + String regex = "^.*_time >= from_unixtime\\(" + indexEarliestEpoch + ".*_time < from_unixtime\\(" + + indexLatestEpoch + ".*$"; this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { String result = this.streamingTestUtil.getCtx().getSparkQuery(); @@ -518,13 +607,17 @@ public void parseTimestampIndexEarliestLatestTest() { }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void streamListTest() { String q = "index = memory earliest=\"05/08/2019:09:10:40\" latest=\"05/10/2022:09:11:40\" host=\"sc-99-99-14-25\" OR host=\"sc-99-99-14-20\" sourcetype=\"log:f17:0\" Latitude"; long earliestEpoch = new DefaultTimeFormat().getEpoch("05/08/2019:09:10:40"); long latestEpoch = new DefaultTimeFormat().getEpoch("05/10/2022:09:11:40"); - String regex = "^.*_time >= from_unixtime\\(" + earliestEpoch + ".*_time < from_unixtime\\(" + latestEpoch + ".*$"; + String regex = "^.*_time >= from_unixtime\\(" + earliestEpoch + ".*_time < from_unixtime\\(" + latestEpoch + + ".*$"; this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { String result = this.streamingTestUtil.getCtx().getSparkQuery(); @@ -532,13 +625,17 @@ public void streamListTest() { }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void streamList1Test() { String q = "index = memory-test earliest=\"05/08/2019:09:10:40\" latest=\"05/10/2022:09:11:40\" host=\"sc-99-99-14-25\" OR host=\"sc-99-99-14-20\" sourcetype=\"log:f17:0\" Latitude"; long earliestEpoch2 = new DefaultTimeFormat().getEpoch("05/08/2019:09:10:40"); long latestEpoch2 = new DefaultTimeFormat().getEpoch("05/10/2022:09:11:40"); - String regex = "^.*_time >= from_unixtime\\(" + earliestEpoch2 + ".*_time < 
from_unixtime\\(" + latestEpoch2 + ".*$"; + String regex = "^.*_time >= from_unixtime\\(" + earliestEpoch2 + ".*_time < from_unixtime\\(" + latestEpoch2 + + ".*$"; this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { String result = this.streamingTestUtil.getCtx().getSparkQuery(); @@ -546,17 +643,21 @@ public void streamList1Test() { }); } - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) public void streamList2Test() { String q = "index = memory-test/yyy earliest=\"05/08/2019:09:10:40\" latest=\"05/10/2022:09:11:40\" host=\"sc-99-99-14-25\" OR host=\"sc-99-99-14-20\" sourcetype=\"log:f17:0\" Latitude"; long earliestEpoch3 = new DefaultTimeFormat().getEpoch("05/08/2019:09:10:40"); long latestEpoch3 = new DefaultTimeFormat().getEpoch("05/10/2022:09:11:40"); - String regex = "^.*_time >= from_unixtime\\(" + earliestEpoch3 + ".*_time < from_unixtime\\(" + latestEpoch3 + ".*$"; + String regex = "^.*_time >= from_unixtime\\(" + earliestEpoch3 + ".*_time < from_unixtime\\(" + latestEpoch3 + + ".*$"; this.streamingTestUtil.performDPLTest(q, this.testFile, res -> { String result = this.streamingTestUtil.getCtx().getSparkQuery(); assertTrue(result.matches(regex)); }); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/statsTransformationStreamingTest.java b/src/test/java/com/teragrep/pth10/statsTransformationStreamingTest.java index ee2f9e8..ee481cb 100644 --- a/src/test/java/com/teragrep/pth10/statsTransformationStreamingTest.java +++ b/src/test/java/com/teragrep/pth10/statsTransformationStreamingTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -64,171 +64,251 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class statsTransformationStreamingTest { - private static final Logger LOGGER = LoggerFactory.getLogger(statsTransformationStreamingTest.class); - - private final String testFile = "src/test/resources/predictTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); - - private StreamingTestUtil streamingTestUtil; - - @org.junit.jupiter.api.BeforeAll - void setEnv() { - this.streamingTestUtil = new StreamingTestUtil(this.testSchema); - this.streamingTestUtil.setEnv(); - } - - @org.junit.jupiter.api.BeforeEach - void setUp() { - this.streamingTestUtil.setUp(); - } - - @org.junit.jupiter.api.AfterEach - void tearDown() { - this.streamingTestUtil.tearDown(); - } - - - // ---------------------------------------- - // Tests - // ---------------------------------------- - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void statsTransform_Streaming_AggDistinctCount_Test() { - streamingTestUtil.performDPLTest( - "index=index_A | stats dc(offset) AS stats_test_result", - testFile, - ds -> { - List listOfResult = ds.select("stats_test_result").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList("25"), listOfResult, "Batch consumer dataset did not contain the expected values !"); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void statsTransform_Streaming_AggEarliest_Test() { - streamingTestUtil.performDPLTest( - "index=index_A | stats earliest(offset) AS stats_test_result", - testFile, - ds -> { - List listOfResult = ds.select("stats_test_result").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Arrays.asList("15"), listOfResult, "Batch consumer dataset did not contain the expected values !"); - }); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void testSplittingByTime() { - streamingTestUtil.performDPLTest( - "index=index_A | stats avg(offset) AS stats_test_result BY _time", - testFile, - ds -> { - List listOfResult = ds.select("stats_test_result").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = Arrays.asList("15.0", "16.0", "17.0", "18.0", "19.0", "20.0", "21.0", - "22.0", "23.0", "24.0", "13.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0", - "10.0", "11.0", "12.0", "13.0", "14.0"); // weird timestamps in the JSON file - assertEquals(expected, 
listOfResult, "Batch consumer dataset did not contain the expected values !"); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void testSplittingByString() { - streamingTestUtil.performDPLTest( - "index=index_A | stats avg(offset) AS stats_test_result BY sourcetype", - testFile, - ds -> { - List listOfResult = ds.select("stats_test_result").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = Arrays.asList("3.0", "8.0", "13.0", "18.0", "23.0"); - assertEquals(expected, listOfResult, "Batch consumer dataset did not contain the expected values !"); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void testSplittingByNumber() { - streamingTestUtil.performDPLTest( - "index=index_A | stats avg(offset) AS stats_test_result BY id", - testFile, - ds -> { - List listOfResult = ds.select("stats_test_result").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = Arrays.asList("1.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0", - "10.0", "11.0", "12.0", "13.0", "14.0", "15.0", "16.0", "17.0", "18.0", "19.0", "20.0", "21.0", - "22.0", "23.0", "24.0", "25.0"); - assertEquals(expected, listOfResult, "Batch consumer dataset did not contain the expected values !"); - } - ); - } - - // Sorts first by sourcetype and then in those sourcetypes it sorts by _time - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void testSplittingByMultipleColumns() { - streamingTestUtil.performDPLTest( - "index=index_A | stats avg(offset) AS stats_test_result BY sourcetype _time", - testFile, - ds -> { - List listOfResult = ds.select("stats_test_result").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = Arrays.asList("1.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0", - "10.0", "15.0", "11.0", "12.0", "13.0", "14.0", "16.0", "17.0", "18.0", "19.0", "20.0", "21.0", - "22.0", "23.0", "24.0", "25.0"); - assertEquals(expected, listOfResult, "Batch consumer dataset did not contain the expected values !"); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void testSplittingByNumericalStrings() { - streamingTestUtil.performDPLTest( - "index=index_A | eval a = offset + 0 | stats avg(offset) AS stats_test_result BY a", - testFile, - ds -> { - List listOfResult = ds.select("stats_test_result").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List expected = Arrays.asList("1.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0", - "10.0", "11.0", "12.0", "13.0", "14.0", "15.0", "16.0", "17.0", "18.0", "19.0", "20.0", "21.0", - "22.0", "23.0", "24.0", "25.0"); - assertEquals(expected, listOfResult, "Batch consumer dataset did not contain the expected values !"); - } - ); - } - - @Test - public void statsTransform_Streaming_AggValues_Test() { - streamingTestUtil.performDPLTest( - "index=index_A | stats values(offset) AS stats_test_result", - testFile, - ds -> { - List listOfResult = ds.select("stats_test_result").collectAsList().stream() - .map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("1\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n2\n20\n21\n22\n23\n24\n25\n3\n4\n5\n6\n7\n8\n9"), - listOfResult, "Batch consumer dataset did not contain the expected values !"); - }); - } - - @Test - 
public void statsTransform_Streaming_AggExactPerc_Test() { - streamingTestUtil.performDPLTest( - "index=index_A | stats exactperc50(offset) AS stats_test_result", - testFile, - ds -> { - List listOfResult = ds.select("stats_test_result").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("13.0"), listOfResult, "Batch consumer dataset did not contain the expected values !"); - }); - } + + private static final Logger LOGGER = LoggerFactory.getLogger(statsTransformationStreamingTest.class); + + private final String testFile = "src/test/resources/predictTransformationTest_data*.json"; // * to make the path into a directory path + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); + + private StreamingTestUtil streamingTestUtil; + + @org.junit.jupiter.api.BeforeAll + void setEnv() { + this.streamingTestUtil = new StreamingTestUtil(this.testSchema); + this.streamingTestUtil.setEnv(); + } + + @org.junit.jupiter.api.BeforeEach + void setUp() { + this.streamingTestUtil.setUp(); + } + + @org.junit.jupiter.api.AfterEach + void tearDown() { + this.streamingTestUtil.tearDown(); + } + + // ---------------------------------------- + // Tests + // ---------------------------------------- + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void statsTransform_Streaming_AggDistinctCount_Test() { + streamingTestUtil.performDPLTest("index=index_A | stats dc(offset) AS stats_test_result", testFile, ds -> { + List listOfResult = ds + .select("stats_test_result") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals( + Arrays.asList("25"), listOfResult, "Batch consumer dataset did not contain the expected values !" + ); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void statsTransform_Streaming_AggEarliest_Test() { + streamingTestUtil + .performDPLTest("index=index_A | stats earliest(offset) AS stats_test_result", testFile, ds -> { + List listOfResult = ds + .select("stats_test_result") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals( + Arrays.asList("15"), listOfResult, "Batch consumer dataset did not contain the expected values !" 
+ ); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void testSplittingByTime() { + streamingTestUtil + .performDPLTest("index=index_A | stats avg(offset) AS stats_test_result BY _time", testFile, ds -> { + List listOfResult = ds + .select("stats_test_result") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = Arrays + .asList( + "15.0", "16.0", "17.0", "18.0", "19.0", "20.0", "21.0", "22.0", "23.0", "24.0", + "13.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0", "10.0", "11.0", + "12.0", "13.0", "14.0" + ); // weird timestamps in the JSON file + assertEquals( + expected, listOfResult, "Batch consumer dataset did not contain the expected values !" + ); + }); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void testSplittingByString() { + streamingTestUtil + .performDPLTest( + "index=index_A | stats avg(offset) AS stats_test_result BY sourcetype", testFile, ds -> { + List listOfResult = ds + .select("stats_test_result") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = Arrays.asList("3.0", "8.0", "13.0", "18.0", "23.0"); + assertEquals( + expected, listOfResult, + "Batch consumer dataset did not contain the expected values !" + ); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void testSplittingByNumber() { + streamingTestUtil + .performDPLTest("index=index_A | stats avg(offset) AS stats_test_result BY id", testFile, ds -> { + List listOfResult = ds + .select("stats_test_result") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = Arrays + .asList( + "1.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0", "10.0", "11.0", + "12.0", "13.0", "14.0", "15.0", "16.0", "17.0", "18.0", "19.0", "20.0", "21.0", + "22.0", "23.0", "24.0", "25.0" + ); + assertEquals( + expected, listOfResult, "Batch consumer dataset did not contain the expected values !" + ); + }); + } + + // Sorts first by sourcetype and then in those sourcetypes it sorts by _time + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void testSplittingByMultipleColumns() { + streamingTestUtil + .performDPLTest( + "index=index_A | stats avg(offset) AS stats_test_result BY sourcetype _time", testFile, ds -> { + List listOfResult = ds + .select("stats_test_result") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = Arrays + .asList( + "1.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0", "10.0", + "15.0", "11.0", "12.0", "13.0", "14.0", "16.0", "17.0", "18.0", "19.0", + "20.0", "21.0", "22.0", "23.0", "24.0", "25.0" + ); + assertEquals( + expected, listOfResult, + "Batch consumer dataset did not contain the expected values !" 
+ ); + } + ); + } + + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void testSplittingByNumericalStrings() { + streamingTestUtil + .performDPLTest( + "index=index_A | eval a = offset + 0 | stats avg(offset) AS stats_test_result BY a", testFile, + ds -> { + List listOfResult = ds + .select("stats_test_result") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List expected = Arrays + .asList( + "1.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0", "10.0", + "11.0", "12.0", "13.0", "14.0", "15.0", "16.0", "17.0", "18.0", "19.0", + "20.0", "21.0", "22.0", "23.0", "24.0", "25.0" + ); + assertEquals( + expected, listOfResult, + "Batch consumer dataset did not contain the expected values !" + ); + } + ); + } + + @Test + public void statsTransform_Streaming_AggValues_Test() { + streamingTestUtil.performDPLTest("index=index_A | stats values(offset) AS stats_test_result", testFile, ds -> { + List listOfResult = ds + .select("stats_test_result") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals( + Collections + .singletonList( + "1\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n2\n20\n21\n22\n23\n24\n25\n3\n4\n5\n6\n7\n8\n9" + ), + listOfResult, "Batch consumer dataset did not contain the expected values !" + ); + }); + } + + @Test + public void statsTransform_Streaming_AggExactPerc_Test() { + streamingTestUtil + .performDPLTest("index=index_A | stats exactperc50(offset) AS stats_test_result", testFile, ds -> { + List listOfResult = ds + .select("stats_test_result") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals( + Collections.singletonList("13.0"), listOfResult, "Batch consumer dataset did not contain the expected values !" + ); + }); + } } - diff --git a/src/test/java/com/teragrep/pth10/statsTransformationTest.java b/src/test/java/com/teragrep/pth10/statsTransformationTest.java index 48f2552..61ba233 100644 --- a/src/test/java/com/teragrep/pth10/statsTransformationTest.java +++ b/src/test/java/com/teragrep/pth10/statsTransformationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -60,6 +60,7 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class statsTransformationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(statsTransformationTest.class); // Use this file for dataset initialization @@ -82,18 +83,17 @@ void tearDown() { this.streamingTestUtil.tearDown(); } - // --- SQL emit mode tests --- - + // --- XML emit mode tests --- - + // --- Catalyst emit mode tests --- // Explanation: // UDAF = User Defined Aggregate Function, deprecated in spark 3.x, not performant enough in many cases // aggregator = custom aggregator, replaces UDAF in spark 3.x and above, performance vastly improved compared to UDAF // spark = uses built-in spark function - + /* * -- Command -- -- Status -- * exactperc() aggregator @@ -123,499 +123,728 @@ void tearDown() { * latest() aggregator * latest_time() aggregator */ - + // Test exactpercX() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggExactPerc_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats exactperc50(offset) AS perc_offset", - testFile, - ds -> { - assertEquals("[perc_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats exactperc50(offset) AS perc_offset", testFile, ds -> { + assertEquals("[perc_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("perc_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("6.5"), destAsList); - }); + List destAsList = ds + .select("perc_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("6.5"), destAsList); + }); } - + // Test percX() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggPerc_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats perc50(offset) AS perc_offset", - testFile, - ds -> { - assertEquals("[perc_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats perc50(offset) AS perc_offset", testFile, ds -> { + assertEquals("[perc_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("perc_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("6"), destAsList); - }); + List destAsList = ds + .select("perc_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("6"), destAsList); + }); } - + // Test rate() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggRate_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats rate(offset) AS rate_offset", - testFile, - ds -> { - assertEquals("[rate_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats rate(offset) AS rate_offset", testFile, ds -> { + assertEquals("[rate_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("rate_offset").collectAsList().stream().map(r -> 
r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("3.2425553062149416E-8"), destAsList); - }); + List destAsList = ds + .select("rate_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("3.2425553062149416E-8"), destAsList); + }); } - + // Test earliest() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggEarliest_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats earliest(offset) AS earliest_offset", - testFile, - ds -> { - assertEquals("[earliest_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats earliest(offset) AS earliest_offset", testFile, ds -> { + assertEquals("[earliest_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("earliest_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("1"), destAsList); - }); + List destAsList = ds + .select("earliest_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("1"), destAsList); + }); } // Test earliest() with no data @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggEarliestNoData_Test() { - streamingTestUtil.performDPLTest("index=index_XYZ | stats earliest(offset) AS earliest_offset", - testFile, - ds -> { + streamingTestUtil + .performDPLTest("index=index_XYZ | stats earliest(offset) AS earliest_offset", testFile, ds -> { assertEquals("[earliest_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("earliest_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List destAsList = ds + .select("earliest_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals(Collections.singletonList(""), destAsList); }); } // Test latest() with no data @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggLatestNoData_Test() { - streamingTestUtil.performDPLTest("index=index_XYZ | stats latest(offset) AS latest_offset", - testFile, - ds -> { - assertEquals("[latest_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_XYZ | stats latest(offset) AS latest_offset", testFile, ds -> { + assertEquals("[latest_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("latest_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList(""), destAsList); - }); + List destAsList = ds + .select("latest_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList(""), destAsList); + }); } - + // Test earliest_time() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggEarliestTime_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats 
earliest_time(offset) AS earliest_time_offset", - testFile, - ds -> { + streamingTestUtil + .performDPLTest("index=index_A | stats earliest_time(offset) AS earliest_time_offset", testFile, ds -> { assertEquals("[earliest_time_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("earliest_time_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List destAsList = ds + .select("earliest_time_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals(Collections.singletonList("978310861"), destAsList); }); } - + // Test values() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggValues_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats values(offset) AS values_offset", - testFile, - ds -> { - assertEquals("[values_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats values(offset) AS values_offset", testFile, ds -> { + assertEquals("[values_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("values_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("1\n10\n11\n2\n3\n4\n5\n6\n7\n8\n9"), destAsList); - }); + List destAsList = ds + .select("values_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("1\n10\n11\n2\n3\n4\n5\n6\n7\n8\n9"), destAsList); + }); } - + // Test list() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggList_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats list(offset) AS list_offset", - testFile, - ds -> { - assertEquals("[list_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats list(offset) AS list_offset", testFile, ds -> { + assertEquals("[list_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("list_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n11"), destAsList); - }); + List destAsList = ds + .select("list_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n11"), destAsList); + }); } - + // Test median() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggMedian_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats median(offset) AS median_offset", - testFile, - ds -> { - assertEquals("[median_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats median(offset) AS median_offset", testFile, ds -> { + assertEquals("[median_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("median_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("6.5"), destAsList); - }); + List destAsList = ds + .select("median_offset") + 
.collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("6.5"), destAsList); + }); } - - // Test mode() + + // Test mode() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggMode_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats mode(offset) AS mode_offset", - testFile, - ds -> { - assertEquals("[mode_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats mode(offset) AS mode_offset", testFile, ds -> { + assertEquals("[mode_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("mode_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("11"), destAsList); - }); + List destAsList = ds + .select("mode_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("11"), destAsList); + }); } - + // Test min() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggMin_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats min(offset) AS min_offset", - testFile, - ds -> { - assertEquals("[min_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats min(offset) AS min_offset", testFile, ds -> { + assertEquals("[min_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("min_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("1"), destAsList); - }); + List destAsList = ds + .select("min_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("1"), destAsList); + }); } - + // Test max() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggMax_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats max(offset) AS max_offset", - testFile, - ds -> { - assertEquals("[max_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats max(offset) AS max_offset", testFile, ds -> { + assertEquals("[max_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("max_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("11"), destAsList); - }); + List destAsList = ds + .select("max_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("11"), destAsList); + }); } - + // Test stdev() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggStdev_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats stdev(offset) AS stdev_offset", - testFile, - ds -> { - assertEquals("[stdev_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats stdev(offset) AS stdev_offset", testFile, ds 
-> { + assertEquals("[stdev_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("stdev_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("3.4761089357690347"), destAsList); - }); + List destAsList = ds + .select("stdev_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("3.4761089357690347"), destAsList); + }); } - + // Test stdevp() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggStdevp_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats stdevp(offset) AS stdevp_offset", - testFile, - ds -> { - assertEquals("[stdevp_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats stdevp(offset) AS stdevp_offset", testFile, ds -> { + assertEquals("[stdevp_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("stdevp_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("3.3281209246193093"), destAsList); - }); + List destAsList = ds + .select("stdevp_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("3.3281209246193093"), destAsList); + }); } - + // Test sum() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggSum_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats sum(offset) AS sum_offset", - testFile, - ds -> { - assertEquals("[sum_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats sum(offset) AS sum_offset", testFile, ds -> { + assertEquals("[sum_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("sum_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("77"), destAsList); - }); + List destAsList = ds + .select("sum_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("77"), destAsList); + }); } // Test sum() with MV field input @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggSum_mvField_Test() { - streamingTestUtil.performDPLTest("index=index_A | eval mv = mvappend(offset, offset+1) | stats sum(mv) AS sum_mv", - testFile, - ds -> { - assertEquals("[sum_mv]", Arrays.toString(ds.columns())); - - List destAsList = ds.select("sum_mv").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("166"), destAsList); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | eval mv = mvappend(offset, offset+1) | stats sum(mv) AS sum_mv", testFile, + ds -> { + assertEquals("[sum_mv]", Arrays.toString(ds.columns())); + + List destAsList = ds + .select("sum_mv") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("166"), destAsList); + } + ); } // Test sum() 
with MV field input @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggSum_mvField_GH261_Test() { - streamingTestUtil.performDPLTest("index=index_A offset < 3" + - "| eval atk = if(offset=0, 1, 0) " + - "| eval def = if(offset=1, 2, 1) " + - "| eval spy = if(offset=2, 4, 6)" + - "| stats sum(atk) AS attack, sum(def) AS defend, sum(spy) as spying", testFile, - ds -> { - List atk = ds.select("attack").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List def = ds.select("defend").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - List spy = ds.select("spying").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - - // should be one of each - assertEquals(1, atk.size()); - assertEquals(1, def.size()); - assertEquals(1, spy.size()); - // aggregate results - assertEquals("0", atk.get(0)); - assertEquals("3", def.get(0)); - assertEquals("10", spy.get(0)); - }); + streamingTestUtil + .performDPLTest( + "index=index_A offset < 3" + "| eval atk = if(offset=0, 1, 0) " + + "| eval def = if(offset=1, 2, 1) " + "| eval spy = if(offset=2, 4, 6)" + + "| stats sum(atk) AS attack, sum(def) AS defend, sum(spy) as spying", + testFile, ds -> { + List atk = ds + .select("attack") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List def = ds + .select("defend") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + List spy = ds + .select("spying") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + + // should be one of each + assertEquals(1, atk.size()); + assertEquals(1, def.size()); + assertEquals(1, spy.size()); + // aggregate results + assertEquals("0", atk.get(0)); + assertEquals("3", def.get(0)); + assertEquals("10", spy.get(0)); + } + ); } - + // Test sumsq() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggSumsq_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats sumsq(offset) AS sumsq_offset", - testFile, - ds -> { - assertEquals("[sumsq_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats sumsq(offset) AS sumsq_offset", testFile, ds -> { + assertEquals("[sumsq_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("sumsq_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("627.0"), destAsList); - }); + List destAsList = ds + .select("sumsq_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("627.0"), destAsList); + }); } - + // Test dc() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggDc_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats dc(offset) AS dc_offset", - testFile, - ds -> { - assertEquals("[dc_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats dc(offset) AS dc_offset", testFile, ds -> { + assertEquals("[dc_offset]", 
Arrays.toString(ds.columns())); - List destAsList = ds.select("dc_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("11"), destAsList); - }); + List destAsList = ds + .select("dc_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("11"), destAsList); + }); } // Test dc() with NULL data @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggDc_NoData_Test() { // rex4j is used to produce nulls here - streamingTestUtil.performDPLTest("| makeresults | eval raw=\"kissa@1\"| rex4j field=raw \"koira@(?\\d)\" | stats dc(koira)", - testFile, - ds -> { - assertEquals("[dc(koira)]", Arrays.toString(ds.columns())); - - List destAsList = ds.select("dc(koira)").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("0"), destAsList); - }); + streamingTestUtil + .performDPLTest( + "| makeresults | eval raw=\"kissa@1\"| rex4j field=raw \"koira@(?\\d)\" | stats dc(koira)", + testFile, ds -> { + assertEquals("[dc(koira)]", Arrays.toString(ds.columns())); + + List destAsList = ds + .select("dc(koira)") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("0"), destAsList); + } + ); } - + // Test estdc() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggEstdc_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats estdc(offset) AS estdc_offset", - testFile, - ds -> { - assertEquals("[estdc_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats estdc(offset) AS estdc_offset", testFile, ds -> { + assertEquals("[estdc_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("estdc_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("11"), destAsList); - }); + List destAsList = ds + .select("estdc_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("11"), destAsList); + }); } - + // Test estdc_error() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggEstdc_error_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats estdc_error(offset) AS estdc_error_offset", - testFile, - ds -> { + streamingTestUtil + .performDPLTest("index=index_A | stats estdc_error(offset) AS estdc_error_offset", testFile, ds -> { assertEquals("[estdc_error_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("estdc_error_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List destAsList = ds + .select("estdc_error_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals(Collections.singletonList("0.0"), destAsList); }); } - + // Test range() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = 
"skipSparkTest", + matches = "true" + ) void statsTransform_AggRange_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats range(offset) AS range_offset", - testFile, - ds -> { - assertEquals("[range_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats range(offset) AS range_offset", testFile, ds -> { + assertEquals("[range_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("range_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("10"), destAsList); - }); + List destAsList = ds + .select("range_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("10"), destAsList); + }); } - + // Test count() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggCount_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats count(offset) AS count_offset", - testFile, - ds -> { - assertEquals("[count_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats count(offset) AS count_offset", testFile, ds -> { + assertEquals("[count_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("count_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("12"), destAsList); - }); + List destAsList = ds + .select("count_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("12"), destAsList); + }); } - + // Test avg() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggAvg_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats avg(offset)", - testFile, - ds -> { - assertEquals("[avg(offset)]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats avg(offset)", testFile, ds -> { + assertEquals("[avg(offset)]", Arrays.toString(ds.columns())); - List destAsList = ds.select("avg(offset)").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("6.416666666666667"), destAsList); - }); + List destAsList = ds + .select("avg(offset)") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("6.416666666666667"), destAsList); + }); } - + // Test mean() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggMean_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats mean(offset)", - testFile, - ds -> { - assertEquals("[mean(offset)]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats mean(offset)", testFile, ds -> { + assertEquals("[mean(offset)]", Arrays.toString(ds.columns())); - List destAsList = ds.select("mean(offset)").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("6.416666666666667"), destAsList); - }); + List destAsList = ds + 
.select("mean(offset)") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("6.416666666666667"), destAsList); + }); } - + // Test var() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggVar_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats var(offset) AS var_offset", - testFile, - ds -> { - assertEquals("[var_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats var(offset) AS var_offset", testFile, ds -> { + assertEquals("[var_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("var_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("12.083333333333332"), destAsList); - }); + List destAsList = ds + .select("var_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("12.083333333333332"), destAsList); + }); } - + // Test varp() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggVarp_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats varp(offset) AS varp_offset", - testFile, - ds -> { - assertEquals("[varp_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats varp(offset) AS varp_offset", testFile, ds -> { + assertEquals("[varp_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("varp_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("11.076388888888888"), destAsList); - }); + List destAsList = ds + .select("varp_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("11.076388888888888"), destAsList); + }); } - + // Test multiple aggregations at once @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_Agg_MultipleTest() { - streamingTestUtil.performDPLTest("index=index_A | stats var(offset) AS var_offset avg(offset) AS avg_offset", - testFile, - ds -> { - assertEquals("[var_offset, avg_offset]", Arrays.toString(ds.columns())); - }); + streamingTestUtil + .performDPLTest( + "index=index_A | stats var(offset) AS var_offset avg(offset) AS avg_offset", testFile, ds -> { + assertEquals("[var_offset, avg_offset]", Arrays.toString(ds.columns())); + } + ); } - + // Test BY field,field @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_Agg_ByTest() { - streamingTestUtil.performDPLTest("index=index_A | stats avg(offset) AS avg_offset BY sourcetype,host", - testFile, - ds -> { + streamingTestUtil + .performDPLTest("index=index_A | stats avg(offset) AS avg_offset BY sourcetype,host", testFile, ds -> { assertEquals("[sourcetype, host, avg_offset]", Arrays.toString(ds.columns())); }); } - + // Test first() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = 
"skipSparkTest", + matches = "true" + ) void statsTransform_AggFirst_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats first(offset) AS first_offset", - testFile, - ds -> { - assertEquals("[first_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats first(offset) AS first_offset", testFile, ds -> { + assertEquals("[first_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("first_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("1"), destAsList); - }); + List destAsList = ds + .select("first_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("1"), destAsList); + }); } - + // Test last() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggLast_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats last(offset) AS last_offset", - testFile, - ds -> { - assertEquals("[last_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats last(offset) AS last_offset", testFile, ds -> { + assertEquals("[last_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("last_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("11"), destAsList); - }); + List destAsList = ds + .select("last_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("11"), destAsList); + }); } - + // Test latest() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggLatest_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats latest(offset) AS latest_offset", - testFile, - ds -> { - assertEquals("[latest_offset]", Arrays.toString(ds.columns())); + streamingTestUtil.performDPLTest("index=index_A | stats latest(offset) AS latest_offset", testFile, ds -> { + assertEquals("[latest_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("latest_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); - assertEquals(Collections.singletonList("11"), destAsList); - }); + List destAsList = ds + .select("latest_offset") + .collectAsList() + .stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); + assertEquals(Collections.singletonList("11"), destAsList); + }); } - + // Test latest_time() @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void statsTransform_AggLatestTime_Test() { - streamingTestUtil.performDPLTest("index=index_A | stats latest_time(offset) AS latest_time_offset", - testFile, - ds -> { + streamingTestUtil + .performDPLTest("index=index_A | stats latest_time(offset) AS latest_time_offset", testFile, ds -> { assertEquals("[latest_time_offset]", Arrays.toString(ds.columns())); - List destAsList = ds.select("latest_time_offset").collectAsList().stream().map(r -> r.getAs(0).toString()).collect(Collectors.toList()); + List destAsList = ds + .select("latest_time_offset") + .collectAsList() + 
.stream() + .map(r -> r.getAs(0).toString()) + .collect(Collectors.toList()); assertEquals(Collections.singletonList("1286709610"), destAsList); }); } diff --git a/src/test/java/com/teragrep/pth10/steps/teragrep/bloomfilter/FilterSizesTest.java b/src/test/java/com/teragrep/pth10/steps/teragrep/bloomfilter/FilterSizesTest.java index adf931f..24ef42e 100644 --- a/src/test/java/com/teragrep/pth10/steps/teragrep/bloomfilter/FilterSizesTest.java +++ b/src/test/java/com/teragrep/pth10/steps/teragrep/bloomfilter/FilterSizesTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022, 2023 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.teragrep.bloomfilter; import com.typesafe.config.Config; @@ -63,11 +62,12 @@ public void filterSizeMapTest() { Properties properties = new Properties(); - properties.put("dpl.pth_06.bloom.db.fields", "[" + - "{expected: 1000, fpp: 0.01}," + - "{expected: 2000, fpp: 0.01}," + - "{expected: 3000, fpp: 0.01}" + - "]"); + properties + .put( + "dpl.pth_06.bloom.db.fields", + "[" + "{expected: 1000, fpp: 0.01}," + "{expected: 2000, fpp: 0.01}," + + "{expected: 3000, fpp: 0.01}" + "]" + ); Config config = ConfigFactory.parseProperties(properties); FilterSizes sizeMap = new FilterSizes(config); @@ -80,32 +80,28 @@ public void filterSizeMapTest() { assertEquals(3, resultMap.size()); } + @Test public void bitSizeMapTest() { Properties properties = new Properties(); - properties.put("dpl.pth_06.bloom.db.fields", "[" + - "{expected: 1000, fpp: 0.01}," + - "{expected: 2000, fpp: 0.01}," + - "{expected: 3000, fpp: 0.01}" + - "]"); + properties + .put( + "dpl.pth_06.bloom.db.fields", + "[" + "{expected: 1000, fpp: 0.01}," + "{expected: 2000, fpp: 0.01}," + + "{expected: 3000, fpp: 0.01}" + "]" + ); Config config = ConfigFactory.parseProperties(properties); FilterSizes sizeMap = new FilterSizes(config); Map bitSizeMap = sizeMap.asBitsizeSortedMap(); - assertEquals(1000L, - bitSizeMap.get(BloomFilter.create(1000,0.01).bitSize()) - ); - assertEquals(2000L, - bitSizeMap.get(BloomFilter.create(2000,0.01).bitSize()) - ); - assertEquals(3000L, - bitSizeMap.get(BloomFilter.create(3000,0.01).bitSize()) - ); + assertEquals(1000L, bitSizeMap.get(BloomFilter.create(1000, 0.01).bitSize())); + assertEquals(2000L, bitSizeMap.get(BloomFilter.create(2000, 0.01).bitSize())); + assertEquals(3000L, bitSizeMap.get(BloomFilter.create(3000, 0.01).bitSize())); assertEquals(3, bitSizeMap.size()); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/steps/teragrep/bloomfilter/TeragrepBloomFilterTest.java b/src/test/java/com/teragrep/pth10/steps/teragrep/bloomfilter/TeragrepBloomFilterTest.java index e633590..e94de27 100644 --- 
a/src/test/java/com/teragrep/pth10/steps/teragrep/bloomfilter/TeragrepBloomFilterTest.java +++ b/src/test/java/com/teragrep/pth10/steps/teragrep/bloomfilter/TeragrepBloomFilterTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022, 2023 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.steps.teragrep.bloomfilter; import com.typesafe.config.Config; @@ -72,7 +71,7 @@ class TeragrepBloomFilterTest { private FilterSizes filterSizes; private final BloomFilter emptyFilter = BloomFilter.create(100, 0.01); - private SortedMap sizeMap; + private SortedMap sizeMap; @BeforeAll void setEnv() throws ClassNotFoundException, SQLException { @@ -81,11 +80,12 @@ void setEnv() throws ClassNotFoundException, SQLException { properties.put("dpl.pth_10.bloom.db.username", username); properties.put("dpl.pth_10.bloom.db.password", password); properties.put("dpl.pth_06.bloom.db.url", connectionUrl); - properties.put("dpl.pth_06.bloom.db.fields", "[" + - "{expected: 10000, fpp: 0.01}," + - "{expected: 20000, fpp: 0.03}," + - "{expected: 30000, fpp: 0.05}" + - "]"); + properties + .put( + "dpl.pth_06.bloom.db.fields", + "[" + "{expected: 10000, fpp: 0.01}," + "{expected: 20000, fpp: 0.03}," + + "{expected: 30000, fpp: 0.05}" + "]" + ); Config config = ConfigFactory.parseProperties(properties); lazyConnection = new LazyConnection(config); @@ -96,32 +96,26 @@ void setEnv() throws ClassNotFoundException, SQLException { filterSizes = new FilterSizes(config); sizeMap = filterSizes.asSortedMap(); - Class.forName ("org.h2.Driver"); + Class.forName("org.h2.Driver"); sizeMap.put(10000L, 0.01); sizeMap.put(20000L, 0.03); sizeMap.put(30000L, 0.05); - String createFilterType = - "CREATE TABLE `filtertype` (" + - "`id` bigint(20) UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY," + - "`expectedElements` bigint(20) NOT NULL," + - "`targetFpp` DOUBLE UNSIGNED NOT NULL" + - ");"; - - String createBloomFilter = - "CREATE TABLE `bloomfilter` (" + - " `id` BIGINT(20) UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY," + - " `partition_id` BIGINT(20) UNSIGNED NOT NULL," + - " `filter_type_id` BIGINT(20) UNSIGNED NOT NULL," + - " `filter` LONGBLOB NOT NULL" + - ");"; + String createFilterType = "CREATE TABLE `filtertype` (" + + "`id` bigint(20) UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY," + + "`expectedElements` bigint(20) NOT NULL," + "`targetFpp` DOUBLE UNSIGNED NOT NULL" + ");"; + + String createBloomFilter = "CREATE TABLE `bloomfilter` (" + + " `id` BIGINT(20) UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY," + + " `partition_id` BIGINT(20) UNSIGNED NOT NULL," + + " `filter_type_id` BIGINT(20) UNSIGNED NOT NULL," + " `filter` LONGBLOB NOT NULL" + ");"; String insertSql = "INSERT INTO `filtertype` (`expectedElements`, `targetFpp`) VALUES (?, ?)"; 
conn.prepareStatement(createFilterType).execute(); conn.prepareStatement(createBloomFilter).execute(); - for(Map.Entry entry : sizeMap.entrySet()) { + for (Map.Entry entry : sizeMap.entrySet()) { try (PreparedStatement stmt = conn.prepareStatement(insertSql)) { stmt.setInt(1, entry.getKey().intValue()); // filtertype.expectedElements @@ -149,23 +143,21 @@ void filterSaveNoOverwriteTest() { Row row = Assertions.assertDoesNotThrow(() -> generatedRow(sizeMap, tokens)); String partition = row.getString(0); byte[] filterBytes = (byte[]) row.get(1); - TeragrepBloomFilter filter = - new TeragrepBloomFilter(partition, filterBytes, lazyConnection.get(), filterSizes); + TeragrepBloomFilter filter = new TeragrepBloomFilter(partition, filterBytes, lazyConnection.get(), filterSizes); filter.saveFilter(false); - Map.Entry entry = sizeMap.entrySet().iterator().next(); + Map.Entry entry = sizeMap.entrySet().iterator().next(); String sql = "SELECT `filter` FROM `bloomfilter`;"; - ResultSet rs = Assertions.assertDoesNotThrow(() -> lazyConnection.get().prepareStatement(sql) - .executeQuery()); + ResultSet rs = Assertions.assertDoesNotThrow(() -> lazyConnection.get().prepareStatement(sql).executeQuery()); int cols = Assertions.assertDoesNotThrow(() -> rs.getMetaData().getColumnCount()); BloomFilter resultFilter = emptyFilter; - while(Assertions.assertDoesNotThrow(() -> rs.next())) { + while (Assertions.assertDoesNotThrow(() -> rs.next())) { byte[] bytes = Assertions.assertDoesNotThrow(() -> rs.getBytes(1)); ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - resultFilter = Assertions.assertDoesNotThrow(() -> BloomFilter.readFrom(bais)); + resultFilter = Assertions.assertDoesNotThrow(() -> BloomFilter.readFrom(bais)); } Assertions.assertNotNull(resultFilter); @@ -184,24 +176,20 @@ void filterSaveOverwriteTest() { Row row = Assertions.assertDoesNotThrow(() -> generatedRow(sizeMap, tokens)); String partition = row.getString(0); byte[] filterBytes = (byte[]) row.get(1); - TeragrepBloomFilter filter = - new TeragrepBloomFilter(partition, filterBytes, lazyConnection.get(), filterSizes); + TeragrepBloomFilter filter = new TeragrepBloomFilter(partition, filterBytes, lazyConnection.get(), filterSizes); filter.saveFilter(true); String sql = "SELECT `filter` FROM `bloomfilter`;"; - ResultSet rs = Assertions.assertDoesNotThrow(() -> lazyConnection - .get() - .prepareStatement(sql) - .executeQuery()); + ResultSet rs = Assertions.assertDoesNotThrow(() -> lazyConnection.get().prepareStatement(sql).executeQuery()); int cols = Assertions.assertDoesNotThrow(() -> rs.getMetaData().getColumnCount()); BloomFilter resultFilter = emptyFilter; - while(Assertions.assertDoesNotThrow(() -> rs.next())) { + while (Assertions.assertDoesNotThrow(() -> rs.next())) { byte[] bytes = Assertions.assertDoesNotThrow(() -> rs.getBytes(1)); ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - resultFilter = Assertions.assertDoesNotThrow(() -> BloomFilter.readFrom(bais)); + resultFilter = Assertions.assertDoesNotThrow(() -> BloomFilter.readFrom(bais)); } Assertions.assertNotNull(resultFilter); @@ -217,24 +205,26 @@ void filterSaveOverwriteTest() { Row secondRow = Assertions.assertDoesNotThrow(() -> generatedRow(sizeMap, secondTokens)); String secondPartition = secondRow.getString(0); byte[] secondFilterBytes = (byte[]) secondRow.get(1); - TeragrepBloomFilter secondFilter = - new TeragrepBloomFilter(secondPartition, secondFilterBytes, lazyConnection.get(), filterSizes); + TeragrepBloomFilter secondFilter = new 
TeragrepBloomFilter( + secondPartition, + secondFilterBytes, + lazyConnection.get(), + filterSizes + ); secondFilter.saveFilter(true); String secondSql = "SELECT `filter` FROM `bloomfilter`;"; - ResultSet secondRs = Assertions.assertDoesNotThrow(() -> lazyConnection - .get() - .prepareStatement(secondSql) - .executeQuery()); + ResultSet secondRs = Assertions + .assertDoesNotThrow(() -> lazyConnection.get().prepareStatement(secondSql).executeQuery()); int secondCols = Assertions.assertDoesNotThrow(() -> secondRs.getMetaData().getColumnCount()); BloomFilter secondResultFilter = emptyFilter; - while(Assertions.assertDoesNotThrow(() -> secondRs.next())) { + while (Assertions.assertDoesNotThrow(() -> secondRs.next())) { byte[] bytes = Assertions.assertDoesNotThrow(() -> secondRs.getBytes(1)); ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - secondResultFilter = Assertions.assertDoesNotThrow(() -> BloomFilter.readFrom(bais)); + secondResultFilter = Assertions.assertDoesNotThrow(() -> BloomFilter.readFrom(bais)); } Assertions.assertNotNull(secondResultFilter); @@ -256,8 +246,7 @@ void correctSizeSelectionTest() { Row row = Assertions.assertDoesNotThrow(() -> generatedRow(sizeMap, tokens)); String partition = row.getString(0); byte[] filterBytes = (byte[]) row.get(1); - TeragrepBloomFilter filter = - new TeragrepBloomFilter(partition, filterBytes, lazyConnection.get(), filterSizes); + TeragrepBloomFilter filter = new TeragrepBloomFilter(partition, filterBytes, lazyConnection.get(), filterSizes); filter.saveFilter(false); long size = Long.MAX_VALUE; @@ -271,18 +260,15 @@ void correctSizeSelectionTest() { Double fpp = sizeMap.get(size); String sql = "SELECT `filter` FROM `bloomfilter`;"; - ResultSet rs = Assertions.assertDoesNotThrow(() -> lazyConnection - .get() - .prepareStatement(sql) - .executeQuery()); + ResultSet rs = Assertions.assertDoesNotThrow(() -> lazyConnection.get().prepareStatement(sql).executeQuery()); int cols = Assertions.assertDoesNotThrow(() -> rs.getMetaData().getColumnCount()); BloomFilter resultFilter = emptyFilter; - while(Assertions.assertDoesNotThrow(() -> rs.next())) { + while (Assertions.assertDoesNotThrow(() -> rs.next())) { byte[] bytes = Assertions.assertDoesNotThrow(() -> rs.getBytes(1)); ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - resultFilter = Assertions.assertDoesNotThrow(() -> BloomFilter.readFrom(bais)); + resultFilter = Assertions.assertDoesNotThrow(() -> BloomFilter.readFrom(bais)); } Assertions.assertNotNull(resultFilter); @@ -300,14 +286,14 @@ private Row generatedRow(SortedMap filterMap, List tokens) long size = filterMap.lastKey(); for (long key : filterMap.keySet()) { - if (key < size && key >= tokens.size()) { + if (key < size && key >= tokens.size()) { size = key; } } BloomFilter bf = BloomFilter.create(size, filterMap.get(size)); - for(String token: tokens){ + for (String token : tokens) { bf.put(token); } @@ -316,4 +302,4 @@ private Row generatedRow(SortedMap filterMap, List tokens) return RowFactory.create("1", baos.toByteArray()); } -} \ No newline at end of file +} diff --git a/src/test/java/com/teragrep/pth10/subSearchTest.java b/src/test/java/com/teragrep/pth10/subSearchTest.java index 71f2c0a..b4c94df 100644 --- a/src/test/java/com/teragrep/pth10/subSearchTest.java +++ b/src/test/java/com/teragrep/pth10/subSearchTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache 
Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -61,6 +61,7 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class subSearchTest { + private static final Logger LOGGER = LoggerFactory.getLogger(subSearchTest.class); private StreamingTestUtil streamingTestUtil; @@ -82,13 +83,16 @@ void tearDown() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void endToEndSubSearch2Test() { - String q="index = index_A [ search sourcetype= A:X:0 | top limit=1 host | fields + host]"; + String q = "index = index_A [ search sourcetype= A:X:0 | top limit=1 host | fields + host]"; String testFile = "src/test/resources/subsearchData*.json"; this.streamingTestUtil.performDPLTest(q, testFile, res -> { - String e="RLIKE(index, (?i)^index_A$)"; + String e = "RLIKE(index, (?i)^index_A$)"; // Check that sub-query get executed and result is used as query parameter assertEquals(e, this.streamingTestUtil.getCtx().getSparkQuery()); @@ -101,13 +105,16 @@ void endToEndSubSearch2Test() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void endToEndSubSearch3Test() { - String q="index = index_A [ search sourcetype= A:X:0 | top limit=3 host | fields + host]"; + String q = "index = index_A [ search sourcetype= A:X:0 | top limit=3 host | fields + host]"; String testFile = "src/test/resources/subsearchData*.json"; // * to make the path into a directory path this.streamingTestUtil.performDPLTest(q, testFile, res -> { - String e="RLIKE(index, (?i)^index_A$)"; + String e = "RLIKE(index, (?i)^index_A$)"; // Check that sub-query get executed and result is used as query parameter assertEquals(e, this.streamingTestUtil.getCtx().getSparkQuery()); @@ -115,7 +122,14 @@ void endToEndSubSearch3Test() { // Should have all the columns, fields command in subsearch shouldn't affect the main search assertEquals(9, res.columns().length); - List lst = res.select("host").distinct().orderBy("host").collectAsList().stream().map(r -> r.getString(0)).collect(Collectors.toList()); + List lst = res + .select("host") + .distinct() + .orderBy("host") + .collectAsList() + .stream() + .map(r -> r.getString(0)) + .collect(Collectors.toList()); assertEquals(2, lst.size()); @@ -126,48 +140,54 @@ void endToEndSubSearch3Test() { } @Disabled - @Test + @Test void endToEndSubSearch4Test() { - String q="index = index_A [ search sourcetype= A:X:0 | top limit=1 host | fields + host] [ search sourcetype= c:X:0| top limit=1 host | fields + host]"; + String q = "index = index_A [ search sourcetype= A:X:0 | top limit=1 host | fields + host] [ search sourcetype= c:X:0| top limit=1 host | fields + host]"; String testFile = "src/test/resources/subsearchData*.json"; // * to make the path into a directory path -// q="index = index_A [ search sourcetype= A:X:0 | top limit=1 host | fields + host] [ search host= computer03.example.com | top limit=1 host | fields + host]"; + // q="index = 
index_A [ search sourcetype= A:X:0 | top limit=1 host | fields + host] [ search host= computer03.example.com | top limit=1 host | fields + host]"; this.streamingTestUtil.performDPLTest(q, testFile, res -> { - String e="(`index` LIKE 'index_A' AND ((`_raw` LIKE '%computer01.example.com%' AND `_raw` LIKE '%computer02.example.com%') AND `_raw` LIKE '%computer01.example.com%'))"; + String e = "(`index` LIKE 'index_A' AND ((`_raw` LIKE '%computer01.example.com%' AND `_raw` LIKE '%computer02.example.com%') AND `_raw` LIKE '%computer01.example.com%'))"; // Check that sub-query get executed and result is used as query parameter assertEquals(e, this.streamingTestUtil.getCtx().getSparkQuery()); // Check full result - e = "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n" + - "|value |\n" + - "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n" + - "|{\"_raw\":\"127.0.0.123:4567 [26/Nov/2021:07:02:44.809] https-in~ https-in/ 0/-1/-1/-1/0 302 104 - - LR-- 1/1/0/0/0 0/0 \\\"GET /Monster_boy_normal_(entity) HTTP/1.1\\\" A:X:0 computer01.example.com computer02.example.com\",\"_time\":\"2001-01-01T01:01:01.011+03:00\",\"host\":\"computer02.example.com\",\"index\":\"index_A\",\"offset\":1,\"partition\":\"hundred-year/2001/01-01/computer01.example.com/01/01.logGLOB-2001010101.log.gz\",\"source\":\"imfile:computer01.example.com:01.log\",\"sourcetype\":\"A:X:0\"}|\n" + - "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n"; + e = "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n" + + "|value |\n" + + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n" + + 
"|{\"_raw\":\"127.0.0.123:4567 [26/Nov/2021:07:02:44.809] https-in~ https-in/ 0/-1/-1/-1/0 302 104 - - LR-- 1/1/0/0/0 0/0 \\\"GET /Monster_boy_normal_(entity) HTTP/1.1\\\" A:X:0 computer01.example.com computer02.example.com\",\"_time\":\"2001-01-01T01:01:01.011+03:00\",\"host\":\"computer02.example.com\",\"index\":\"index_A\",\"offset\":1,\"partition\":\"hundred-year/2001/01-01/computer01.example.com/01/01.logGLOB-2001010101.log.gz\",\"source\":\"imfile:computer01.example.com:01.log\",\"sourcetype\":\"A:X:0\"}|\n" + + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n"; String jsonStr = res.toJSON().showString(7, 0, false); assertEquals(e, jsonStr); }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void endToEndSearchTest() { - String q="sourcetype=A:X:0| top limit=2 host | fields + host"; + String q = "sourcetype=A:X:0| top limit=2 host | fields + host"; String testFile = "src/test/resources/xmlWalkerTestDataStreaming"; this.streamingTestUtil.performDPLTest(q, testFile, res -> { - String head=res.head().getString(0); + String head = res.head().getString(0); List lst = res.collectAsList(); // Correct item count - assertEquals(2,lst.size()); - assertEquals("computer01.example.com",lst.get(0).getString(0)); - assertEquals("computer02.example.com",lst.get(1).getString(0)); + assertEquals(2, lst.size()); + assertEquals("computer01.example.com", lst.get(0).getString(0)); + assertEquals("computer02.example.com", lst.get(1).getString(0)); }); } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void endToEndSearch1Test() { - String q="index = index_A AND computer01.example.com AND computer02.example.com"; + String q = "index = index_A AND computer01.example.com AND computer02.example.com"; String testFile = "src/test/resources/subsearchData*.json"; // * to make the path into a directory path this.streamingTestUtil.performDPLTest(q, testFile, res -> { @@ -177,24 +197,25 @@ void endToEndSearch1Test() { } @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) void endToEndSearch3Test() { - String q="sourcetype=c:X:0| top limit=1 host | fields + host"; + String q = "sourcetype=c:X:0| top limit=1 host | fields + host"; String testFile = "src/test/resources/subsearchData*.json"; // * to make the path into a directory path this.streamingTestUtil.performDPLTest(q, testFile, res -> { List lst = res.collectAsList(); - lst.forEach(item->{ - LOGGER.info("item value={}",item.getString(0)); + lst.forEach(item -> { + LOGGER.info("item value={}", item.getString(0)); }); // Correct item count - assertEquals(1,lst.size()); - assertEquals("computer03.example.com",lst.get(0).getString(0)); + assertEquals(1, lst.size()); + assertEquals("computer03.example.com", lst.get(0).getString(0)); boolean aggregates = this.streamingTestUtil.getCatalystVisitor().getAggregatesUsed(); assertFalse(aggregates); }); } - } - diff --git 
a/src/test/java/com/teragrep/pth10/syntaxErrorTest.java b/src/test/java/com/teragrep/pth10/syntaxErrorTest.java index 5306f80..ce84e52 100644 --- a/src/test/java/com/teragrep/pth10/syntaxErrorTest.java +++ b/src/test/java/com/teragrep/pth10/syntaxErrorTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -52,71 +52,72 @@ import static org.junit.jupiter.api.Assertions.assertThrows; public class syntaxErrorTest { - @Disabled(value = "Should be converted to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void syntaxTest() { - String e; - final String q; - q = "index = archive_memory ( host = \"localhost\" Deny"; - e = "failed to parse at line 1:49 due to missing PARENTHESIS_R at ''"; - Throwable exception = assertThrows(IllegalStateException.class, () -> utils.getQueryAnalysis(q)); - assertEquals(e, exception.getMessage()); - throw new UnsupportedOperationException("Implement"); - } - @Disabled(value = "Should be converted to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void syntax1Test() { - String e; - final String q; + @Disabled(value = "Should be converted to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void syntaxTest() { + String e; + final String q; + q = "index = archive_memory ( host = \"localhost\" Deny"; + e = "failed to parse at line 1:49 due to missing PARENTHESIS_R at ''"; + Throwable exception = assertThrows(IllegalStateException.class, () -> utils.getQueryAnalysis(q)); + assertEquals(e, exception.getMessage()); + throw new UnsupportedOperationException("Implement"); + } - q = "index = archive_memory ( host = \"localhost\" OR host = \"test\" @))) < AND sourcetype = \"memory\" Deny"; - e = "failed to parse at line 1:61 due to extraneous input '@' expecting PARENTHESIS_R"; - Throwable exception = assertThrows(IllegalStateException.class, () -> utils.getQueryAnalysis(q)); - assertEquals(e, exception.getMessage()); - throw new UnsupportedOperationException("Implement"); + @Disabled(value = "Should be converted to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void syntax1Test() { + String e; + final String q; - } + q = "index = archive_memory ( host = \"localhost\" OR host = \"test\" @))) < AND sourcetype = \"memory\" Deny"; + e = "failed to parse at line 1:61 due to extraneous input '@' expecting PARENTHESIS_R"; + Throwable exception = assertThrows(IllegalStateException.class, () -> utils.getQueryAnalysis(q)); + assertEquals(e, exception.getMessage()); + throw new UnsupportedOperationException("Implement"); - /** - * Now input is valid and test\"localhost is just plain string - */ - @Disabled(value = "Should be converted to a dataframe test") - @Test - public void syntax2Test() { - String e; - final String q; - q = "index = 
archive_memory host = test\"localhost"; - e = "failed to parse at line 1:30 due to token recognition error at: '\"localhost'"; - Throwable exception = assertThrows(IllegalStateException.class, () -> utils.getQueryAnalysis(q)); - assertEquals(e, exception.getMessage()); - throw new UnsupportedOperationException("Implement"); + } - } + /** + * Now input is valid and test\"localhost is just plain string + */ + @Disabled(value = "Should be converted to a dataframe test") + @Test + public void syntax2Test() { + String e; + final String q; + q = "index = archive_memory host = test\"localhost"; + e = "failed to parse at line 1:30 due to token recognition error at: '\"localhost'"; + Throwable exception = assertThrows(IllegalStateException.class, () -> utils.getQueryAnalysis(q)); + assertEquals(e, exception.getMessage()); + throw new UnsupportedOperationException("Implement"); - @Disabled(value = "Should be converted to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void syntaxError3Test() { - String e; - final String q; - q = "index = \"cpu\" sourcetype=\"log:cpu:0\" host=\"sc-99-99-14-19\" OR host = \"sc-99-99-10-201\")"; - e = "failed to parse at line 1:86 due to extraneous input ')' expecting {, PIPE}"; - Throwable exception = assertThrows(IllegalStateException.class, () -> utils.getQueryAnalysis(q)); - assertEquals(e, exception.getMessage()); - throw new UnsupportedOperationException("Implement"); + } - } + @Disabled(value = "Should be converted to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void syntaxError3Test() { + String e; + final String q; + q = "index = \"cpu\" sourcetype=\"log:cpu:0\" host=\"sc-99-99-14-19\" OR host = \"sc-99-99-10-201\")"; + e = "failed to parse at line 1:86 due to extraneous input ')' expecting {, PIPE}"; + Throwable exception = assertThrows(IllegalStateException.class, () -> utils.getQueryAnalysis(q)); + assertEquals(e, exception.getMessage()); + throw new UnsupportedOperationException("Implement"); - @Disabled(value = "Should be converted to a dataframe test") - @Test // disabled on 2022-05-16 TODO convert to dataframe test - public void syntax4Test() { - String q,e; - // missing parameter in IF-clause - q = "index=*,cinnamon | where if(substr(_raw,0,14)==\"127.0.0.49\",\"true\")"; - e = "failed to parse at line 1:69 due to mismatched input ')' expecting {EVAL_LANGUAGE_MODE_COMMA, EVAL_LANGUAGE_MODE_DEQ, EVAL_LANGUAGE_MODE_EQ, EVAL_LANGUAGE_MODE_NEQ, EVAL_LANGUAGE_MODE_LT, EVAL_LANGUAGE_MODE_LTE, EVAL_LANGUAGE_MODE_GT, EVAL_LANGUAGE_MODE_GTE, EVAL_LANGUAGE_MODE_DOT, EVAL_LANGUAGE_MODE_AND, EVAL_LANGUAGE_MODE_OR, EVAL_LANGUAGE_MODE_XOR, EVAL_LANGUAGE_MODE_WILDCARD, EVAL_LANGUAGE_MODE_PLUS, EVAL_LANGUAGE_MODE_MINUS, EVAL_LANGUAGE_MODE_SLASH, EVAL_LANGUAGE_MODE_Like, EVAL_LANGUAGE_MODE_PERCENT, EVAL_LANGUAGE_MODE_LIKE}"; - Throwable exception = assertThrows(IllegalStateException.class, () -> utils.getQueryAnalysis(q)); - assertEquals(e, exception.getMessage()); - throw new UnsupportedOperationException("Implement"); - } + } + + @Disabled(value = "Should be converted to a dataframe test") + @Test // disabled on 2022-05-16 TODO convert to dataframe test + public void syntax4Test() { + String q, e; + // missing parameter in IF-clause + q = "index=*,cinnamon | where if(substr(_raw,0,14)==\"127.0.0.49\",\"true\")"; + e = "failed to parse at line 1:69 due to mismatched input ')' expecting {EVAL_LANGUAGE_MODE_COMMA, EVAL_LANGUAGE_MODE_DEQ, EVAL_LANGUAGE_MODE_EQ, 
EVAL_LANGUAGE_MODE_NEQ, EVAL_LANGUAGE_MODE_LT, EVAL_LANGUAGE_MODE_LTE, EVAL_LANGUAGE_MODE_GT, EVAL_LANGUAGE_MODE_GTE, EVAL_LANGUAGE_MODE_DOT, EVAL_LANGUAGE_MODE_AND, EVAL_LANGUAGE_MODE_OR, EVAL_LANGUAGE_MODE_XOR, EVAL_LANGUAGE_MODE_WILDCARD, EVAL_LANGUAGE_MODE_PLUS, EVAL_LANGUAGE_MODE_MINUS, EVAL_LANGUAGE_MODE_SLASH, EVAL_LANGUAGE_MODE_Like, EVAL_LANGUAGE_MODE_PERCENT, EVAL_LANGUAGE_MODE_LIKE}"; + Throwable exception = assertThrows(IllegalStateException.class, () -> utils.getQueryAnalysis(q)); + assertEquals(e, exception.getMessage()); + throw new UnsupportedOperationException("Implement"); + } } diff --git a/src/test/java/com/teragrep/pth10/translationTests/ChartTest.java b/src/test/java/com/teragrep/pth10/translationTests/ChartTest.java index c57447c..3dde568 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/ChartTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/ChartTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -63,6 +63,7 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class ChartTest { + @Test void testChartTranslation() { String query = "| chart count(_raw) by _time"; @@ -78,7 +79,10 @@ void testChartTranslation() { ct.visitChartTransformation((DPLParser.ChartTransformationContext) tree.getChild(1).getChild(0)); ChartStep cs = ct.chartStep; - assertEquals("[countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS `count(_raw)`]", Arrays.toString(cs.getListOfExpr().toArray())); + assertEquals( + "[countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS `count(_raw)`]", + Arrays.toString(cs.getListOfExpr().toArray()) + ); assertEquals("[_time]", Arrays.toString(cs.getListOfGroupBy().toArray())); } @@ -98,7 +102,10 @@ void testChartTranslation_multiGroupBy() { ct.visitChartTransformation((DPLParser.ChartTransformationContext) tree.getChild(1).getChild(0)); ChartStep cs = ct.chartStep; - assertEquals("[countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS `count(_raw)`]", Arrays.toString(cs.getListOfExpr().toArray())); + assertEquals( + "[countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS `count(_raw)`]", + Arrays.toString(cs.getListOfExpr().toArray()) + ); 
assertEquals("[_time, fieldTwo]", Arrays.toString(cs.getListOfGroupBy().toArray())); } @@ -117,9 +124,11 @@ void testChartTranslation_multiGroupByNoComma() { ct.visitChartTransformation((DPLParser.ChartTransformationContext) tree.getChild(1).getChild(0)); ChartStep cs = ct.chartStep; - assertEquals("[countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS `count(_raw)`]", Arrays.toString(cs.getListOfExpr().toArray())); + assertEquals( + "[countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS `count(_raw)`]", + Arrays.toString(cs.getListOfExpr().toArray()) + ); assertEquals("[fieldTwo, _time]", Arrays.toString(cs.getListOfGroupBy().toArray())); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/ConvertTest.java b/src/test/java/com/teragrep/pth10/translationTests/ConvertTest.java index 0b441d6..2be029d 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/ConvertTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/ConvertTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -137,4 +137,3 @@ void testConvertTranslation3() { } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/DedupTest.java b/src/test/java/com/teragrep/pth10/translationTests/DedupTest.java index cd2f198..5082c6c 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/DedupTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/DedupTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -46,7 +46,6 @@ package com.teragrep.pth10.translationTests; import com.teragrep.pth10.ast.DPLParserCatalystContext; -import com.teragrep.pth10.ast.DPLParserCatalystVisitor; import com.teragrep.pth10.ast.commands.transformstatement.DedupTransformation; import com.teragrep.pth10.steps.dedup.DedupStep; import com.teragrep.pth_03.antlr.DPLLexer; @@ -66,7 +65,9 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class DedupTest { + private static final Logger LOGGER = LoggerFactory.getLogger(DedupTest.class); + @Test void testDedupTranslation() { String query = "| dedup fieldOne, fieldTwo"; @@ -112,7 +113,7 @@ void testDedupTranslationWithMaxDuplicatesParam() { } - @Test + @Test void testDedupTranslationWithConsecutiveParam() { String query = "| dedup fieldOne, fieldTwo consecutive=true"; CharStream inputStream = CharStreams.fromString(query); @@ -135,7 +136,7 @@ void testDedupTranslationWithConsecutiveParam() { assertTrue(ds.getConsecutive()); } - @Test + @Test void testDedupTranslationWithKeepEmptyParam() { String query = "| dedup fieldOne, fieldTwo keepempty=true"; CharStream inputStream = CharStreams.fromString(query); @@ -159,7 +160,7 @@ void testDedupTranslationWithKeepEmptyParam() { } - @Test + @Test void testDedupTranslationWithKeepEventsParam() { String query = "| dedup fieldOne, fieldTwo keepevents=true"; CharStream inputStream = CharStreams.fromString(query); @@ -205,4 +206,3 @@ void testDedupTranslationWithSortbyParam() { // TODO add assertion for sort by clause } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/DplTest.java b/src/test/java/com/teragrep/pth10/translationTests/DplTest.java index 2116ec6..3d7bb4c 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/DplTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/DplTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -64,6 +64,7 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class DplTest { + @Test void testDplTranslation() { String query = "| dpl parsetree "; @@ -84,4 +85,3 @@ void testDplTranslation() { assertEquals("[]", Arrays.toString(cs.getLines().toArray())); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/EvalTest.java b/src/test/java/com/teragrep/pth10/translationTests/EvalTest.java index c47ed27..0d4eff2 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/EvalTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/EvalTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -62,18 +62,17 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class EvalTest { + SparkSession spark = null; DPLParserCatalystContext ctx = null; + @org.junit.jupiter.api.BeforeAll void setEnv() { - spark = SparkSession - .builder() - .appName("Java Spark SQL basic example") - .master("local[2]") - .getOrCreate(); + spark = SparkSession.builder().appName("Java Spark SQL basic example").master("local[2]").getOrCreate(); spark.sparkContext().setLogLevel("ERROR"); ctx = new DPLParserCatalystContext(spark); } + @Test void testEvalTranslation() { final String query = "| eval a = abs(-3)"; @@ -89,7 +88,7 @@ void testEvalTranslation() { ct.visitEvalTransformation((DPLParser.EvalTransformationContext) tree.getChild(1).getChild(0)); final EvalStep cs = ct.evalStatement.evalStep; - assertEquals("a",cs.getLeftSide()); + assertEquals("a", cs.getLeftSide()); assertEquals("abs(-3)", cs.getRightSide().toString()); } @@ -109,8 +108,8 @@ void testEvalTranslation2() { ct.visitEvalTransformation((DPLParser.EvalTransformationContext) tree.getChild(1).getChild(0)); final EvalStep cs = ct.evalStatement.evalStep; - assertEquals("a",cs.getLeftSide()); - assertEquals("EvalArithmetic(EvalArithmetic(3, +, 4), *, 7)",cs.getRightSide().toString()); + assertEquals("a", cs.getLeftSide()); + assertEquals("EvalArithmetic(EvalArithmetic(3, +, 4), *, 7)", cs.getRightSide().toString()); } @Test @@ -128,8 +127,7 @@ void testEvalTranslation3() { ct.visitEvalTransformation((DPLParser.EvalTransformationContext) tree.getChild(1).getChild(0)); final EvalStep cs = ct.evalStatement.evalStep; - assertEquals("a",cs.getLeftSide()); + assertEquals("a", cs.getLeftSide()); assertEquals("string", cs.getRightSide().toString()); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/ExplainTest.java b/src/test/java/com/teragrep/pth10/translationTests/ExplainTest.java index a4778bd..9faaba3 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/ExplainTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/ExplainTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy 
+ * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -99,4 +99,3 @@ void testExplainTranslation2() { assertEquals(AbstractExplainStep.ExplainMode.EXTENDED, cs.getMode()); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/FieldsTest.java b/src/test/java/com/teragrep/pth10/translationTests/FieldsTest.java index 8edf443..3684cb8 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/FieldsTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/FieldsTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -66,7 +66,9 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class FieldsTest { + private static final Logger LOGGER = LoggerFactory.getLogger(FieldsTest.class); + @Test void testFieldsTranslation() { String query = "| fields + _raw, _time, offset "; @@ -109,4 +111,3 @@ void testFieldsTranslation2() { assertEquals("[_raw, _time, offset]", Arrays.toString(cs.getListOfFields().toArray())); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/IplocationTest.java b/src/test/java/com/teragrep/pth10/translationTests/IplocationTest.java index 3f2e13d..63b4676 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/IplocationTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/IplocationTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -64,6 +64,7 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class IplocationTest { + private static final Logger LOGGER = LoggerFactory.getLogger(IplocationTest.class); @Test diff --git a/src/test/java/com/teragrep/pth10/translationTests/JoinTest.java b/src/test/java/com/teragrep/pth10/translationTests/JoinTest.java index 2e42fdf..4052e22 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/JoinTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/JoinTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -64,12 +64,13 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class JoinTest { + /* String joinMode = "inner"; - Boolean usetime = false; - Boolean earlier = true; - Boolean overwrite = true; - Integer max = 1; + Boolean usetime = false; + Boolean earlier = true; + Boolean overwrite = true; + Integer max = 1; */ @Test void testJoinTranslation() { @@ -84,7 +85,6 @@ void testJoinTranslation() { DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); - JoinTransformation ct = new JoinTransformation(visitor, ctx); ct.visitJoinTransformation((DPLParser.JoinTransformationContext) tree.getChild(1).getChild(0)); JoinStep cs = ct.joinStep; @@ -124,4 +124,3 @@ void testJoinTranslation2() { assertFalse(cs.getUsetime()); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/MakeresultsTest.java b/src/test/java/com/teragrep/pth10/translationTests/MakeresultsTest.java index 6a849ae..e9b64bc 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/MakeresultsTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/MakeresultsTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -82,7 +82,7 @@ void testMakeresultsTranslation() { } @Test - void testMakeresultsTranslation2() { + void testMakeresultsTranslation2() { String query = " | makeresults "; CharStream inputStream = CharStreams.fromString(query); DPLLexer lexer = new DPLLexer(inputStream); @@ -100,4 +100,3 @@ void testMakeresultsTranslation2() { assertFalse(cs.isAnnotate()); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/PredictTest.java b/src/test/java/com/teragrep/pth10/translationTests/PredictTest.java index 0955dae..68fd92e 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/PredictTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/PredictTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -43,7 +43,6 @@ * Teragrep, the applicable Commercial License may apply to this file if you as * a licensee so wish it. */ - package com.teragrep.pth10.translationTests; import com.teragrep.pth10.ast.DPLParserCatalystContext; @@ -60,6 +59,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; public class PredictTest { + @Test void testPredictTranslation_Basic() { String query = " | predict field"; diff --git a/src/test/java/com/teragrep/pth10/translationTests/RegexTest.java b/src/test/java/com/teragrep/pth10/translationTests/RegexTest.java index 0ceff43..3564fa4 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/RegexTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/RegexTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -103,4 +103,3 @@ void testRegexTranslation2() { assertTrue(cs.isEquals()); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/RenameTest.java b/src/test/java/com/teragrep/pth10/translationTests/RenameTest.java index 88d7322..ef1a3fb 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/RenameTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/RenameTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -61,6 +61,7 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class RenameTest { + @Test void testRenameTranslation() { String query = " | rename field as new"; @@ -97,4 +98,3 @@ void testRenameWithQuotesTranslation() { assertEquals("new name", cs.getMapOfRenamedFields().get("field")); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/ReplaceTest.java b/src/test/java/com/teragrep/pth10/translationTests/ReplaceTest.java index 176b8ee..6683f99 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/ReplaceTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/ReplaceTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -130,4 +130,3 @@ void testReplaceTranslationMultipleWiths() { assertEquals("[_raw, anotherField]", Arrays.toString(cs.listOfFields().toArray())); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/Rex4jTest.java b/src/test/java/com/teragrep/pth10/translationTests/Rex4jTest.java index 5f56875..5d08aba 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/Rex4jTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/Rex4jTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. 
* * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -61,8 +61,9 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class Rex4jTest { + @Test - void testRex4jTranslation() { + void testRex4jTranslation() { String query = " | rex4j field=host mode=sed \"s/from/to/g\""; CharStream inputStream = CharStreams.fromString(query); DPLLexer lexer = new DPLLexer(inputStream); @@ -145,4 +146,3 @@ void testRex4jTranslation_ExtractionMode() { assertNull(cs.getMaxMatch()); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/RexTest.java b/src/test/java/com/teragrep/pth10/translationTests/RexTest.java index cc62f5f..a9b121d 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/RexTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/RexTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 diff --git a/src/test/java/com/teragrep/pth10/translationTests/SendemailTest.java b/src/test/java/com/teragrep/pth10/translationTests/SendemailTest.java index e0bb979..b7159c7 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/SendemailTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/SendemailTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -83,7 +83,8 @@ void testSendemailTranslation() { assertNotNull(cs.getSendemailResultsProcessor()); Map params = cs.getSendemailResultsProcessor().getParameters(); - Map expected = buildParamMap(new Object() {}.getClass().getEnclosingMethod().getName()); + Map expected = buildParamMap(new Object() { + }.getClass().getEnclosingMethod().getName()); assertEquals(expected.size(), params.size()); for (Map.Entry ent : params.entrySet()) { @@ -113,7 +114,8 @@ void testSendemailTranslation2() { assertNotNull(cs.getSendemailResultsProcessor()); Map params = cs.getSendemailResultsProcessor().getParameters(); - Map expected = buildParamMap(new Object() {}.getClass().getEnclosingMethod().getName()); + Map expected = buildParamMap(new Object() { + }.getClass().getEnclosingMethod().getName()); assertEquals(expected.size(), params.size()); for (Map.Entry ent : params.entrySet()) { @@ -126,8 +128,9 @@ void testSendemailTranslation2() { } /** - * Build a map of all the parameters in the SendemailProcessor for testing - * based on the given test method name - these are the expected values. + * Build a map of all the parameters in the SendemailProcessor for testing based on the given test method name - + * these are the expected values. + * * @param testName name of test method * @return map of expected parameter values */ @@ -193,4 +196,3 @@ else if (testName.equals("testSendemailTranslation2")) { return params; } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/SortTest.java b/src/test/java/com/teragrep/pth10/translationTests/SortTest.java index cf122e6..45ec83c 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/SortTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/SortTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -68,7 +68,9 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class SortTest { + private static final Logger LOGGER = LoggerFactory.getLogger(SortTest.class); + @Test void testSortTranslation() { String query = "| sort +num(offset)"; @@ -150,7 +152,10 @@ void testSortTranslation2() { // build list from expected List expected = new ArrayList<>(); - expected.add(sbc); expected.add(sbc2); expected.add(sbc3); expected.add(sbc4); + expected.add(sbc); + expected.add(sbc2); + expected.add(sbc3); + expected.add(sbc4); for (int i = 0; i < cs.getListOfSortByClauses().size(); i++) { SortByClause fromRun = cs.getListOfSortByClauses().get(i); @@ -165,4 +170,3 @@ void testSortTranslation2() { assertEquals(1234, cs.getLimit()); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/SpathTest.java b/src/test/java/com/teragrep/pth10/translationTests/SpathTest.java index 40d527a..e362694 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/SpathTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/SpathTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -61,6 +61,7 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class SpathTest { + @Test void testSpathTranslation() { final String query = "| spath input=_raw output=out path=this.is.a.path"; @@ -76,10 +77,10 @@ void testSpathTranslation() { ct.visitSpathTransformation((DPLParser.SpathTransformationContext) tree.getChild(1).getChild(0)); final SpathStep cs = ct.spathStep; - assertEquals("this.is.a.path",cs.getPath()); - assertEquals("_raw", cs.getInputColumn()); - assertFalse(cs.getAutoExtractionMode()); - assertEquals("out", cs.getOutputColumn()); + assertEquals("this.is.a.path", cs.getPath()); + assertEquals("_raw", cs.getInputColumn()); + assertFalse(cs.getAutoExtractionMode()); + assertEquals("out", cs.getOutputColumn()); } @Test @@ -105,4 +106,3 @@ void testSpathTranslation2() { assertEquals("$$dpl_pth10_internal_column_spath_output$$", cs.getOutputColumn()); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/StatsTest.java b/src/test/java/com/teragrep/pth10/translationTests/StatsTest.java index 3dedddf..d9e2754 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/StatsTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/StatsTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. 
* * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -61,6 +61,7 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class StatsTest { + @Test void testStatsTranslation() { final String query = "| stats count(_raw) by _time"; @@ -76,7 +77,10 @@ void testStatsTranslation() { ct.visitStatsTransformation((DPLParser.StatsTransformationContext) tree.getChild(1).getChild(0)); final StatsStep cs = ct.statsStep; - assertEquals("countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS `count(_raw)`",cs.getListOfAggregationExpressions().get(0).toString()); + assertEquals( + "countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS `count(_raw)`", + cs.getListOfAggregationExpressions().get(0).toString() + ); assertEquals("_time", cs.getListOfGroupBys().get(0).toString()); } @@ -96,8 +100,11 @@ void testStatsTranslation2() { ct.visitStatsTransformation((DPLParser.StatsTransformationContext) tree.getChild(1).getChild(0)); final StatsStep cs = ct.statsStep; - assertEquals("countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS `count(_raw)`",cs.getListOfAggregationExpressions().get(0).toString()); - assertEquals("avg(_raw) AS `avg(_raw)`",cs.getListOfAggregationExpressions().get(1).toString()); + assertEquals( + "countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS `count(_raw)`", + cs.getListOfAggregationExpressions().get(0).toString() + ); + assertEquals("avg(_raw) AS `avg(_raw)`", cs.getListOfAggregationExpressions().get(1).toString()); assertEquals("_time", cs.getListOfGroupBys().get(0).toString()); } @@ -116,9 +123,11 @@ void testStatsTranslation3() { ct.visitStatsTransformation((DPLParser.StatsTransformationContext) tree.getChild(1).getChild(0)); final StatsStep cs = ct.statsStep; - assertEquals("countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS `count(_raw)`",cs.getListOfAggregationExpressions().get(0).toString()); - assertEquals("avg(_raw) AS `avg(_raw)`",cs.getListOfAggregationExpressions().get(1).toString()); + assertEquals( + "countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS `count(_raw)`", + cs.getListOfAggregationExpressions().get(0).toString() + ); + assertEquals("avg(_raw) AS `avg(_raw)`", cs.getListOfAggregationExpressions().get(1).toString()); assertEquals(0, cs.getListOfGroupBys().size()); } } - diff --git 
a/src/test/java/com/teragrep/pth10/translationTests/StrcatTest.java b/src/test/java/com/teragrep/pth10/translationTests/StrcatTest.java index b9288e0..e23d461 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/StrcatTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/StrcatTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -63,6 +63,7 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class StrcatTest { + @Test void testStrcatTranslation() { final String query = "| strcat \"Hello\" \"World\" hello_world"; @@ -83,4 +84,3 @@ void testStrcatTranslation() { assertEquals(2, cs.getNumberOfSrcFieldsOriginally()); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/TableTest.java b/src/test/java/com/teragrep/pth10/translationTests/TableTest.java index c1de2e0..3a6c040 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/TableTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/TableTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -82,4 +82,3 @@ void testTableTranslation() { assertEquals("[a, b, c, d]", Arrays.toString(cs.getListOfFields().toArray())); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/TeragrepTest.java b/src/test/java/com/teragrep/pth10/translationTests/TeragrepTest.java index 6e1f0bd..9b4e4d0 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/TeragrepTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/TeragrepTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -71,7 +71,9 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class TeragrepTest { + private static final Logger LOGGER = LoggerFactory.getLogger(TeragrepTest.class); + @Test void testTeragrepTranslation() { final String query = "| teragrep exec syslog stream host 127.0.0.123 port 1337"; @@ -85,7 +87,8 @@ void testTeragrepTranslation() { final DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); final TeragrepTransformation ct = new TeragrepTransformation(ctx, visitor); - StepNode stepNode = (StepNode) ct.visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); + StepNode stepNode = (StepNode) ct + .visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); AbstractStep step = stepNode.get(); assertEquals(TeragrepSyslogStep.class, step.getClass()); @@ -108,7 +111,8 @@ void testTeragrepDefaultParamsTranslation() { final DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); final TeragrepTransformation ct = new TeragrepTransformation(ctx, visitor); - StepNode stepNode = (StepNode) ct.visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); + StepNode stepNode = (StepNode) ct + .visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); final AbstractStep step = stepNode.get(); assertEquals(TeragrepSyslogStep.class, step.getClass()); @@ -141,7 +145,8 @@ void testTeragrepSyslogConfigTranslation() { final DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); final TeragrepTransformation ct = new TeragrepTransformation(ctx, visitor); - StepNode stepNode = (StepNode) ct.visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); + StepNode stepNode = (StepNode) ct + .visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); final AbstractStep step = stepNode.get(); assertEquals(TeragrepSyslogStep.class, step.getClass()); @@ -164,7 +169,8 @@ void testTeragrepHdfsSaveTranslation() { final DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); final TeragrepTransformation ct = new TeragrepTransformation(ctx, visitor); - StepNode stepNode = (StepNode) ct.visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); + StepNode stepNode = (StepNode) ct + .visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); final AbstractStep step = stepNode.get(); assertEquals(TeragrepHdfsSaveStep.class, step.getClass()); @@ -189,7 +195,8 @@ void testTeragrepHdfsSaveRetentionTranslation() { LOGGER.debug(tree.toStringTree(parser)); final TeragrepTransformation ct = new TeragrepTransformation(ctx, visitor); - StepNode stepNode = (StepNode) ct.visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); + StepNode stepNode = (StepNode) ct + .visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); final AbstractStep step = stepNode.get(); assertEquals(TeragrepHdfsSaveStep.class, step.getClass()); @@ -214,7 +221,8 @@ void testTeragrepHdfsSaveOverwriteTranslation() { LOGGER.debug(tree.toStringTree(parser)); final TeragrepTransformation ct = new TeragrepTransformation(ctx, visitor); - StepNode stepNode = (StepNode) 
ct.visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); + StepNode stepNode = (StepNode) ct + .visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); final AbstractStep step = stepNode.get(); assertEquals(TeragrepHdfsSaveStep.class, step.getClass()); @@ -238,7 +246,8 @@ void testTeragrepHdfsLoadTranslation() { final DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); final TeragrepTransformation ct = new TeragrepTransformation(ctx, visitor); - StepNode stepNode = (StepNode) ct.visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); + StepNode stepNode = (StepNode) ct + .visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); final AbstractStep step = stepNode.get(); assertEquals(TeragrepHdfsLoadStep.class, step.getClass()); @@ -260,7 +269,8 @@ void testTeragrepTranslationKafkaSave() { final DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); final TeragrepTransformation ct = new TeragrepTransformation(ctx, visitor); - StepNode stepNode = (StepNode) ct.visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); + StepNode stepNode = (StepNode) ct + .visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); final AbstractStep step = stepNode.get(); assertEquals(TeragrepKafkaStep.class, step.getClass()); @@ -282,7 +292,8 @@ void testTeragrepHdfsListTranslation() { final DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); final TeragrepTransformation ct = new TeragrepTransformation(ctx, visitor); - StepNode stepNode = (StepNode) ct.visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); + StepNode stepNode = (StepNode) ct + .visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); AbstractStep step = stepNode.get(); assertEquals(TeragrepHdfsListStep.class, step.getClass()); @@ -304,7 +315,8 @@ void testTeragrepHdfsDeleteTranslation() { final DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); final TeragrepTransformation ct = new TeragrepTransformation(ctx, visitor); - StepNode stepNode = (StepNode) ct.visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); + StepNode stepNode = (StepNode) ct + .visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); AbstractStep step = stepNode.get(); assertEquals(TeragrepHdfsDeleteStep.class, step.getClass()); @@ -326,7 +338,8 @@ void testTeragrepBloomCreateTranslation() { final DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); final TeragrepTransformation ct = new TeragrepTransformation(ctx, visitor); - StepListNode stepNode = (StepListNode) ct.visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); + StepListNode stepNode = (StepListNode) ct + .visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); final AbstractStep step = stepNode.asList().get(0); // first is an aggregation step final AbstractStep step1 = stepNode.asList().get(1); // second is bloom create @@ -352,7 +365,8 @@ void testTeragrepBloomUpdateTranslation() { final DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); final TeragrepTransformation ct = new 
TeragrepTransformation(ctx, visitor); - StepListNode stepNode = (StepListNode) ct.visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); + StepListNode stepNode = (StepListNode) ct + .visitTeragrepTransformation((DPLParser.TeragrepTransformationContext) tree.getChild(1).getChild(0)); final AbstractStep step = stepNode.asList().get(0); // first is an aggregation step final AbstractStep step1 = stepNode.asList().get(1); // second is bloom update @@ -365,4 +379,3 @@ void testTeragrepBloomUpdateTranslation() { assertEquals(TeragrepBloomStep.BloomMode.UPDATE, bloomStep1.mode); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/TimechartTest.java b/src/test/java/com/teragrep/pth10/translationTests/TimechartTest.java index 2bb4307..9df2c47 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/TimechartTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/TimechartTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -64,7 +64,9 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class TimechartTest { + private static final Logger LOGGER = LoggerFactory.getLogger(TimechartTest.class); + @Test void testTimeChartTranslation() { String query = "| timechart span=5min sum(sales) as sales by product"; @@ -86,7 +88,10 @@ void testTimeChartTranslation() { TimechartStep tcs = tct.timechartStep; assertEquals("window(_time, 300000000, 300000000, 0) AS window", tcs.getSpan().toString()); - assertEquals("sumaggregator(encodeusingserializer(input[0, java.lang.Object, true], false) AS value, decodeusingserializer(input[0, binary, true], com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses.SumBuffer, false), staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false, true)) AS `sum(sales)` AS sales", tcs.getAggCols().get(0).toString()); + assertEquals( + "sumaggregator(encodeusingserializer(input[0, java.lang.Object, true], false) AS value, decodeusingserializer(input[0, binary, true], com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses.SumBuffer, false), staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false, true)) AS `sum(sales)` AS sales", + tcs.getAggCols().get(0).toString() + ); assertEquals("product", tcs.getDivByInsts().get(0)); } @@ -108,7 +113,10 @@ void testTimeChartTranslation_NoByClause() { TimechartStep tcs = tct.timechartStep; assertEquals("window(_time, 300000000, 300000000, 0) AS window", tcs.getSpan().toString()); - assertEquals("sumaggregator(encodeusingserializer(input[0, java.lang.Object, true], false) AS value, decodeusingserializer(input[0, binary, true], com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses.SumBuffer, false), staticinvoke(class 
org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false, true)) AS `sum(sales)` AS sales", tcs.getAggCols().get(0).toString()); + assertEquals( + "sumaggregator(encodeusingserializer(input[0, java.lang.Object, true], false) AS value, decodeusingserializer(input[0, binary, true], com.teragrep.pth10.ast.commands.aggregate.UDAFs.BufferClasses.SumBuffer, false), staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false, true)) AS `sum(sales)` AS sales", + tcs.getAggCols().get(0).toString() + ); assertEquals(0, tcs.getDivByInsts().size()); } @@ -130,8 +138,10 @@ void testTimeChartTranslationBasic() { TimechartStep tcs = tct.timechartStep; assertEquals("window(_time, 86400000000, 86400000000, 0) AS window", tcs.getSpan().toString()); - assertEquals("countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS count", tcs.getAggCols().get(0).toString()); + assertEquals( + "countaggregator(input[0, java.lang.Long, true].longValue AS value, staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, input[0, bigint, true], true, false, true), input[0, java.lang.Long, true].longValue) AS count", + tcs.getAggCols().get(0).toString() + ); assertEquals(0, tcs.getDivByInsts().size()); } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/TopTest.java b/src/test/java/com/teragrep/pth10/translationTests/TopTest.java index 8078296..f6de421 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/TopTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/TopTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -46,7 +46,6 @@ package com.teragrep.pth10.translationTests; import com.teragrep.pth10.ast.DPLParserCatalystContext; -import com.teragrep.pth10.ast.DPLParserCatalystVisitor; import com.teragrep.pth10.ast.commands.transformstatement.TopTransformation; import com.teragrep.pth10.steps.top.TopStep; import com.teragrep.pth_03.antlr.DPLLexer; @@ -58,12 +57,11 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInstance; -import java.util.ArrayList; - import static org.junit.jupiter.api.Assertions.assertEquals; @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class TopTest { + @Test void testTopTranslation() { final String query = " | top _raw 10"; @@ -84,4 +82,3 @@ void testTopTranslation() { } } - diff --git a/src/test/java/com/teragrep/pth10/translationTests/WhereTest.java b/src/test/java/com/teragrep/pth10/translationTests/WhereTest.java index 182f600..87f68db 100644 --- a/src/test/java/com/teragrep/pth10/translationTests/WhereTest.java +++ b/src/test/java/com/teragrep/pth10/translationTests/WhereTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -62,18 +62,17 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class WhereTest { + SparkSession spark = null; DPLParserCatalystContext ctx = null; + @org.junit.jupiter.api.BeforeAll void setEnv() { - spark = SparkSession - .builder() - .appName("Java Spark SQL basic example") - .master("local[2]") - .getOrCreate(); + spark = SparkSession.builder().appName("Java Spark SQL basic example").master("local[2]").getOrCreate(); spark.sparkContext().setLogLevel("ERROR"); ctx = new DPLParserCatalystContext(spark); } + @Test void testWhereTranslation() { final String query = " | where offset < 5"; @@ -86,7 +85,9 @@ void testWhereTranslation() { ct.visitWhereTransformation((DPLParser.WhereTransformationContext) tree.getChild(1).getChild(0)); final WhereStep cs = ct.whereStep; - assertEquals("EvalOperation(offset, " + DPLLexer.EVAL_LANGUAGE_MODE_LT + ", 5)", cs.getWhereColumn().toString()); + assertEquals( + "EvalOperation(offset, " + DPLLexer.EVAL_LANGUAGE_MODE_LT + ", 5)", cs.getWhereColumn().toString() + ); } @Test @@ -119,7 +120,9 @@ void testWhereTranslation3() { ct.visitWhereTransformation((DPLParser.WhereTransformationContext) tree.getChild(1).getChild(0)); final WhereStep cs = ct.whereStep; - assertEquals("EvalOperation(offset, " + DPLLexer.EVAL_LANGUAGE_MODE_EQ + ", 5)", cs.getWhereColumn().toString()); + assertEquals( + "EvalOperation(offset, " + DPLLexer.EVAL_LANGUAGE_MODE_EQ + ", 5)", cs.getWhereColumn().toString() + ); } @Test @@ -139,4 +142,3 @@ void testWhereTranslation4() { assertEquals("(NOT field LIKE %40%)", cs.getWhereColumn().toString()); } } - diff --git a/src/test/java/com/teragrep/pth10/utils.java b/src/test/java/com/teragrep/pth10/utils.java index 
4a27881..27894de 100644 --- a/src/test/java/com/teragrep/pth10/utils.java +++ b/src/test/java/com/teragrep/pth10/utils.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . * * * Additional permission under GNU Affero General Public License version 3 @@ -65,95 +65,98 @@ import java.util.ArrayList; import java.util.UUID; - public class utils { - private static final Logger LOGGER = LoggerFactory.getLogger(utils.class); - public static Dataset executeQueryWithCatalystOutput(String str, SparkSession spark, Dataset testSet) { - // TODO change to streaming mode - // initializing DPLParserCatalystContext with existing dataset -> processing will not be streaming - DPLParserCatalystContext ctx = new DPLParserCatalystContext(spark, testSet); - CharStream inputStream = CharStreams.fromString(str); - DPLLexer lexer = new DPLLexer(inputStream); - DPLParser parser = new DPLParser(new CommonTokenStream(lexer)); - parser.addErrorListener(new BaseErrorListener() { - @Override - public void syntaxError(Recognizer recognizer, Object offendingSymbol, int line, int charPositionInLine, String msg, RecognitionException e) { - throw new IllegalStateException("failed to parse at line " + line + ":" + charPositionInLine + " due to " + msg, e); - } - }); - ParseTree tree = parser.root(); + private static final Logger LOGGER = LoggerFactory.getLogger(utils.class); + + public static Dataset executeQueryWithCatalystOutput(String str, SparkSession spark, Dataset testSet) { + // TODO change to streaming mode + // initializing DPLParserCatalystContext with existing dataset -> processing will not be streaming + DPLParserCatalystContext ctx = new DPLParserCatalystContext(spark, testSet); + CharStream inputStream = CharStreams.fromString(str); + DPLLexer lexer = new DPLLexer(inputStream); + DPLParser parser = new DPLParser(new CommonTokenStream(lexer)); + parser.addErrorListener(new BaseErrorListener() { + + @Override + public void syntaxError( + Recognizer recognizer, + Object offendingSymbol, + int line, + int charPositionInLine, + String msg, + RecognitionException e + ) { + throw new IllegalStateException( + "failed to parse at line " + line + ":" + charPositionInLine + " due to " + msg, + e + ); + } + }); + ParseTree tree = parser.root(); + + ctx.setEarliest("-1Y"); + com.teragrep.pth10.ast.DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); + try { + CatalystNode rv = (CatalystNode) visitor.visit(tree); + return rv.getDataset(); + } + catch (Exception e) { + e.printStackTrace(); + throw e; + } + } - ctx.setEarliest("-1Y"); - com.teragrep.pth10.ast.DPLParserCatalystVisitor visitor = new DPLParserCatalystVisitor(ctx); - try { - CatalystNode rv = (CatalystNode) visitor.visit(tree); - return rv.getDataset(); - } catch (Exception e) { - e.printStackTrace(); - throw e; - } - } + public static boolean isUUID(String uuid) { + try { + UUID id = UUID.fromString(uuid.replace("_", "-")); + } + catch (IllegalArgumentException ex) { + 
// Not uuid, + LOGGER.debug("NOT UUID: <{}>", uuid); + return false; + } + return true; + } - public static boolean isUUID(String uuid) { - try { - UUID id = UUID.fromString(uuid.replace("_", "-")); - } catch (IllegalArgumentException ex) { - // Not uuid, - LOGGER.debug("NOT UUID: <{}>", uuid); - return false; - } - return true; - } - public static void printDebug(String e, String result){ - LOGGER.debug("Spark SQL=<{}>", result); - LOGGER.debug("Spark EXP=<{}>", e); - LOGGER.debug("----------------------"); - } + public static void printDebug(String e, String result) { + LOGGER.debug("Spark SQL=<{}>", result); + LOGGER.debug("Spark EXP=<{}>", e); + LOGGER.debug("----------------------"); + } - public static String getQueryAnalysis(String str) { - StructType exampleSchema = new StructType( - new StructField[]{ - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("origin", DataTypes.StringType, false, new MetadataBuilder().build()) - } - ); + public static String getQueryAnalysis(String str) { + StructType exampleSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("origin", DataTypes.StringType, false, new MetadataBuilder().build()) + }); - ArrayList rowArrayList = new ArrayList<>(); - Row row = RowFactory.create( - Timestamp.from(Instant.ofEpochSecond(0L)), - "test data ", - "test_index", - "test:sourcetype:0", - "test.host.domain.example.com", - "source:test", - "partition/test/0", - 0L, - "test origin" - ); - rowArrayList.add(row); + ArrayList rowArrayList = new ArrayList<>(); + Row row = RowFactory + .create(Timestamp.from(Instant.ofEpochSecond(0L)), "test data ", "test_index", "test:sourcetype:0", "test.host.domain.example.com", "source:test", "partition/test/0", 0L, "test origin"); + rowArrayList.add(row); - SparkSession sparkSession = SparkSession.builder().master("local[*]").getOrCreate(); - sparkSession = sparkSession.newSession(); - sparkSession.sparkContext().setLogLevel("ERROR"); - Dataset rowDataset = sparkSession.createDataFrame(rowArrayList, exampleSchema); - Dataset rv = executeQueryWithCatalystOutput(str, sparkSession, rowDataset); + SparkSession sparkSession = 
SparkSession.builder().master("local[*]").getOrCreate(); + sparkSession = sparkSession.newSession(); + sparkSession.sparkContext().setLogLevel("ERROR"); + Dataset rowDataset = sparkSession.createDataFrame(rowArrayList, exampleSchema); + Dataset rv = executeQueryWithCatalystOutput(str, sparkSession, rowDataset); - // returning canonicalized plan because the one with column names - // contains references to column instance which increment on each - // execution and therefore are not valid. check mapping of names: - // with rv.queryExecution().analyzed().canonicalized().numberedTreeString(): - // 03 +- LocalRelation [none#0, none#1, none#2, none#3, none#4, none#5, none#6, none#7L, none#8] - // with rv.queryExecution().analyzed().numberedTreeString(): - // 03 +- LocalRelation [_time#439, _raw#440, index#441, sourcetype#442, host#443, source#444, partition#445, offset#446L, origin#447] - return rv.queryExecution().analyzed().canonicalized().numberedTreeString(); - } + // returning canonicalized plan because the one with column names + // contains references to column instance which increment on each + // execution and therefore are not valid. check mapping of names: + // with rv.queryExecution().analyzed().canonicalized().numberedTreeString(): + // 03 +- LocalRelation [none#0, none#1, none#2, none#3, none#4, none#5, none#6, none#7L, none#8] + // with rv.queryExecution().analyzed().numberedTreeString(): + // 03 +- LocalRelation [_time#439, _raw#440, index#441, sourcetype#442, host#443, source#444, partition#445, offset#446L, origin#447] + return rv.queryExecution().analyzed().canonicalized().numberedTreeString(); + } } diff --git a/src/test/java/com/teragrep/pth10/whereTest.java b/src/test/java/com/teragrep/pth10/whereTest.java index 44ce0cf..5e725d2 100644 --- a/src/test/java/com/teragrep/pth10/whereTest.java +++ b/src/test/java/com/teragrep/pth10/whereTest.java @@ -1,6 +1,6 @@ /* - * Teragrep DPL to Catalyst Translator PTH-10 - * Copyright (C) 2019, 2020, 2021, 2022 Suomen Kanuuna Oy + * Teragrep Data Processing Language (DPL) translator for Apache Spark (pth_10) + * Copyright (C) 2019-2024 Suomen Kanuuna Oy * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -13,7 +13,7 @@ * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . + * along with this program. If not, see . 
* * * Additional permission under GNU Affero General Public License version 3 @@ -65,276 +65,246 @@ @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class whereTest { - private static final Logger LOGGER = LoggerFactory.getLogger(whereTest.class); + private static final Logger LOGGER = LoggerFactory.getLogger(whereTest.class); + // proper tests -v ---------------------------------------- - // proper tests -v ---------------------------------------- + private final String testFile = "src/test/resources/regexTransformationTest_data*.json"; // * to make the path into a directory path + private final StructType testSchema = new StructType(new StructField[] { + new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), + new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), + new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), + new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) + }); - private final String testFile = "src/test/resources/regexTransformationTest_data*.json"; // * to make the path into a directory path - private final StructType testSchema = new StructType( - new StructField[] { - new StructField("_time", DataTypes.TimestampType, false, new MetadataBuilder().build()), - new StructField("id", DataTypes.LongType, false, new MetadataBuilder().build()), - new StructField("_raw", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("index", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("sourcetype", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("host", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("source", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("partition", DataTypes.StringType, false, new MetadataBuilder().build()), - new StructField("offset", DataTypes.LongType, false, new MetadataBuilder().build()) - } - ); + private StreamingTestUtil streamingTestUtil; - private StreamingTestUtil streamingTestUtil; + @org.junit.jupiter.api.BeforeAll + void setEnv() { + this.streamingTestUtil = new StreamingTestUtil(this.testSchema); + this.streamingTestUtil.setEnv(); + } - @org.junit.jupiter.api.BeforeAll - void setEnv() { - this.streamingTestUtil = new StreamingTestUtil(this.testSchema); - this.streamingTestUtil.setEnv(); - } + @org.junit.jupiter.api.BeforeEach + void setUp() { + this.streamingTestUtil.setUp(); + } - @org.junit.jupiter.api.BeforeEach - void setUp() { - this.streamingTestUtil.setUp(); - } + @org.junit.jupiter.api.AfterEach + void tearDown() { + this.streamingTestUtil.tearDown(); + } - @org.junit.jupiter.api.AfterEach - void tearDown() { - this.streamingTestUtil.tearDown(); - } + // ---------------------------------------- + // Tests + // ---------------------------------------- + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void parseWhereTest() throws Exception { + String q, e, result, uuid; + q = "index = 
cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as count by _time | where count > 70"; + long indexEarliestEpoch = new DefaultTimeFormat().getEpoch("04/16/2020:10:25:40"); + e = "SELECT * FROM ( SELECT _time,count(_raw) AS count FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime(" + + indexEarliestEpoch + ") GROUP BY _time ) WHERE count > 70"; + result = utils.getQueryAnalysis(q); + /* + LOGGER.info("SQL ="+result); + LOGGER.info("EXP ="+e); + */ + assertEquals(e, result, q); + } - // ---------------------------------------- - // Tests - // ---------------------------------------- + /** + * + * + * + * + * --------------------------- + * + * + * + * + * scala-sample create dataframe + * (Result of search-transform when root=true) val df = spark.readStream.load().option("query","") + * process that ( processing resulting dataframe) val resultingDataSet = + * df.groupBy(col("`_time`")).agg(functions.count(col("`_raw`")).as("`count`")).where(col("`_raw`").gt(70)); Same + * using single spark.readStream.load().option("query","").groupBy(col("`_time`")).agg(functions.count(col("`_raw`")).as("`count`")).where(col("`_raw`").gt(70)); + * when using treewalker, add "`"-around column names count -> `count` + */ + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void logicalOrWhereTest() throws Exception { + String q, e, result; + // test where-clause with logical operation OR + q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | where cnt > 70 OR cnt < 75"; + long indexEarliestEpoch5 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); + e = "SELECT * FROM ( SELECT _time,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime(" + + indexEarliestEpoch5 + ") GROUP BY _time ) WHERE cnt > 70 OR cnt < 75"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result, q); + } - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void parseWhereTest() throws Exception { - String q, e, result, uuid; - q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as count by _time | where count > 70"; - long indexEarliestEpoch = new DefaultTimeFormat().getEpoch("04/16/2020:10:25:40"); - e = "SELECT * FROM ( SELECT _time,count(_raw) AS count FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime("+indexEarliestEpoch+") GROUP BY _time ) WHERE count > 70"; - result = utils.getQueryAnalysis(q); - /* - LOGGER.info("SQL ="+result); - LOGGER.info("EXP ="+e); - */ - assertEquals(e,result, q); - } + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void logicalAndWhereTest() throws Exception { + String q, e, result; + // test where-clause with logical operation AND + q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | where cnt > 70 AND cnt < 75"; + long indexEarliestEpoch4 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); + e = "SELECT * FROM ( SELECT _time,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime(" + + indexEarliestEpoch4 + ") GROUP BY _time ) WHERE cnt > 70 AND cnt < 75"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result, q); + } - /** - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * 
--------------------------- - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * scala-sample - * create dataframe (Result of search-transform when root=true) - * val df = spark.readStream.load().option("query","") - * process that ( processing resulting dataframe) - * val resultingDataSet = df.groupBy(col("`_time`")).agg(functions.count(col("`_raw`")).as("`count`")).where(col("`_raw`").gt(70)); - * - * Same using single - * spark.readStream.load().option("query","").groupBy(col("`_time`")).agg(functions.count(col("`_raw`")).as("`count`")).where(col("`_raw`").gt(70)); - * when using treewalker, add "`"-around column names count -> `count` - * - */ + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void logicalAndOrWhereTest() throws Exception { + String q, e, result; + // test where-clause with logical operation AND, OR + q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | where cnt > 70 AND cnt < 75 OR cnt != 72"; + long indexEarliestEpoch6 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); + e = "SELECT * FROM ( SELECT _time,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime(" + + indexEarliestEpoch6 + ") GROUP BY _time ) WHERE cnt > 70 AND cnt < 75 OR cnt != 72"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result, q); + } - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void logicalOrWhereTest() throws Exception { - String q, e, result; - // test where-clause with logical operation OR - q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | where cnt > 70 OR cnt < 75"; - long indexEarliestEpoch5 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); - e = "SELECT * FROM ( SELECT _time,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime("+indexEarliestEpoch5+") GROUP BY _time ) WHERE cnt > 70 OR cnt < 75"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result, q); - } + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void chainedTransformWhereTest() throws Exception { + String q, e, result; + q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | where cnt > 70 | where cnt < 75"; + long indexEarliestEpoch2 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); + e = "SELECT * FROM ( SELECT * FROM ( SELECT _time,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime(" + + indexEarliestEpoch2 + ") GROUP BY _time ) WHERE cnt > 70 ) WHERE cnt < 75"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result, q); + } - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void logicalAndWhereTest() throws Exception { - String q, e, result; - // test where-clause with logical operation AND - q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | where cnt > 70 AND cnt < 75"; - long indexEarliestEpoch4 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); - e = "SELECT * FROM ( SELECT _time,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime("+indexEarliestEpoch4+") GROUP BY _time ) WHERE cnt > 70 AND cnt < 75"; - result = 
utils.getQueryAnalysis(q); - assertEquals(e,result, q); - } + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void logicalWhereTest() throws Exception { + String q, e, result; + // test where-clause with logical operation AND, OR with parents + q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | where ( cnt > 70 AND cnt < 75 ) OR ( cnt > 30 AND cnt < 40 )"; + long indexEarliestEpoch8 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); + e = "SELECT * FROM ( SELECT _time,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime(" + + indexEarliestEpoch8 + + ") GROUP BY _time ) WHERE ( cnt > 70 AND cnt < 75 ) OR ( cnt > 30 AND cnt < 40 )"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result, q); + } - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void logicalAndOrWhereTest() throws Exception { - String q, e, result; - // test where-clause with logical operation AND, OR - q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | where cnt > 70 AND cnt < 75 OR cnt != 72"; - long indexEarliestEpoch6 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); - e = "SELECT * FROM ( SELECT _time,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime("+indexEarliestEpoch6+") GROUP BY _time ) WHERE cnt > 70 AND cnt < 75 OR cnt != 72"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result, q); - } + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void complexWhereTest() throws Exception { + String q, e, result; + // test where-clause with logical operation AND, OR with parents and several + // logical operation + q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | where ( cnt > 70 AND cnt < 75 ) OR ( cnt > 30 AND cnt < 40 AND cnt !=35 OR cnt = 65)"; + long indexEarliestEpoch = new DefaultTimeFormat().getEpoch("04/16/2020:10:25:40"); + e = "SELECT * FROM ( SELECT _time,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime(" + + indexEarliestEpoch + + ") GROUP BY _time ) WHERE ( cnt > 70 AND cnt < 75 ) OR ( cnt > 30 AND cnt < 40 AND cnt != 35 OR cnt = 65 )"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result, q); + } - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void chainedTransformWhereTest() throws Exception { - String q, e, result; - q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | where cnt > 70 | where cnt < 75"; - long indexEarliestEpoch2 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); - e = "SELECT * FROM ( SELECT * FROM ( SELECT _time,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime("+indexEarliestEpoch2+") GROUP BY _time ) WHERE cnt > 70 ) WHERE cnt < 75"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result, q); - } + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void complexWhereXmlTest() throws Exception { + String q, e, result; + // test where-clause with logical operation AND, OR with parents and several + // logical operation + q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | 
where ( cnt > 70 AND cnt < 75 ) OR ( cnt > 30 AND cnt < 40 AND cnt !=35 OR cnt = 65)"; + long indexEarliestEpoch = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); + e = ""; + result = utils.getQueryAnalysis(q); + assertEquals(e, result, q); + } - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void logicalWhereTest() throws Exception { - String q, e, result; - // test where-clause with logical operation AND, OR with parents - q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | where ( cnt > 70 AND cnt < 75 ) OR ( cnt > 30 AND cnt < 40 )"; - long indexEarliestEpoch8 = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); - e = "SELECT * FROM ( SELECT _time,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime("+indexEarliestEpoch8+") GROUP BY _time ) WHERE ( cnt > 70 AND cnt < 75 ) OR ( cnt > 30 AND cnt < 40 )"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result, q); - } + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void symbolTableTest() throws Exception { + String q, e, result; + // test symbol-table, count(raw)->replaced with generated row in form + // __count_UUID + q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) by _time | where \"count(_raw)\" > 70 | where \"count(_raw)\" < 75"; + long indexEarliestEpoch = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); + e = "SELECT * FROM ( SELECT * FROM ( SELECT _time,count(_raw) AS __count_UUID FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime(" + + indexEarliestEpoch + ") GROUP BY _time ) WHERE __count_UUID > 70 ) WHERE __count_UUID < 75"; + result = utils.getQueryAnalysis(q); + // find generated fieldname from result and check that it is like __count_UUID + String r[] = result.split("__count"); + String uuid = r[1].substring(1, 37); + if (utils.isUUID(uuid)) { + // Was generated row-name so accept that as expected one + e = e.replace("__count_UUID", "__count_" + uuid); + } + assertEquals(e, result, q); + } - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void complexWhereTest() throws Exception { - String q, e, result; - // test where-clause with logical operation AND, OR with parents and several - // logical operation - q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | where ( cnt > 70 AND cnt < 75 ) OR ( cnt > 30 AND cnt < 40 AND cnt !=35 OR cnt = 65)"; - long indexEarliestEpoch = new DefaultTimeFormat().getEpoch("04/16/2020:10:25:40"); - e = "SELECT * FROM ( SELECT _time,count(_raw) AS cnt FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime("+indexEarliestEpoch+") GROUP BY _time ) WHERE ( cnt > 70 AND cnt < 75 ) OR ( cnt > 30 AND cnt < 40 AND cnt != 35 OR cnt = 65 )"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result, q); - } + @Disabled + @Test // disabled on 2022-05-16 TODO Convert to dataframe test + public void defaultCountWithLogicalOperationsTest() throws Exception { + String q, e, result, uuid; + // test where-clause with logical operation AND, OR and testing symbol-table + q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) by _time | where 'count(_raw)' > 71 AND 'count(_raw)' < 75 OR 'count(_raw)' != 72"; + long indexEarliestEpoch = new 
DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); + e = "SELECT * FROM ( SELECT _time,count(_raw) AS `count(_raw)` FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime(" + + indexEarliestEpoch + + ") GROUP BY _time ) WHERE 'count(_raw)' > 71 AND 'count(_raw)' < 75 OR 'count(_raw)' != 72"; + result = utils.getQueryAnalysis(q); + assertEquals(e, result, q); + } - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void complexWhereXmlTest() throws Exception { - String q, e, result; - // test where-clause with logical operation AND, OR with parents and several - // logical operation - q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) as cnt by _time | where ( cnt > 70 AND cnt < 75 ) OR ( cnt > 30 AND cnt < 40 AND cnt !=35 OR cnt = 65)"; - long indexEarliestEpoch = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); - e = ""; - result = utils.getQueryAnalysis(q); - assertEquals(e,result, q); - } + @Test + @DisabledIfSystemProperty( + named = "skipSparkTest", + matches = "true" + ) + public void whereTestIntegerColumnLessThan() { + streamingTestUtil.performDPLTest("index=index_A | where offset < 3", testFile, ds -> { + assertEquals( + "[_time, id, _raw, index, sourcetype, host, source, partition, offset]", + Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void symbolTableTest() throws Exception { - String q, e, result; - // test symbol-table, count(raw)->replaced with generated row in form - // __count_UUID - q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) by _time | where \"count(_raw)\" > 70 | where \"count(_raw)\" < 75"; - long indexEarliestEpoch = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); - e = "SELECT * FROM ( SELECT * FROM ( SELECT _time,count(_raw) AS __count_UUID FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime("+indexEarliestEpoch+") GROUP BY _time ) WHERE __count_UUID > 70 ) WHERE __count_UUID < 75"; - result = utils.getQueryAnalysis(q); - // find generated fieldname from result and check that it is like __count_UUID - String r[] = result.split("__count"); - String uuid = r[1].substring(1, 37); - if (utils.isUUID(uuid)) { - // Was generated row-name so accept that as expected one - e = e.replace("__count_UUID", "__count_" + uuid); - } - assertEquals(e,result, q); - } + assertEquals(2, ds.collectAsList().size()); + }); + } - @Disabled - @Test // disabled on 2022-05-16 TODO Convert to dataframe test - public void defaultCountWithLogicalOperationsTest() throws Exception { - String q, e, result,uuid; - // test where-clause with logical operation AND, OR and testing symbol-table - q = "index = cinnamon _index_earliest=\"04/16/2020:10:25:40\" | chart count(_raw) by _time | where 'count(_raw)' > 71 AND 'count(_raw)' < 75 OR 'count(_raw)' != 72"; - long indexEarliestEpoch = new DPLTimeFormat("MM/dd/yyyy:HH:mm:ss").getEpoch("04/16/2020:10:25:40"); - e = "SELECT * FROM ( SELECT _time,count(_raw) AS `count(_raw)` FROM `temporaryDPLView` WHERE index LIKE \"cinnamon\" AND _time >= from_unixtime("+indexEarliestEpoch+") GROUP BY _time ) WHERE 'count(_raw)' > 71 AND 'count(_raw)' < 75 OR 'count(_raw)' != 72"; - result = utils.getQueryAnalysis(q); - assertEquals(e,result, q); - } + @Test + @DisabledIfSystemProperty( + named = 
"skipSparkTest", + matches = "true" + ) + public void whereTestIntegerColumnLessThanAfterChart() { + streamingTestUtil + .performDPLTest( + "index=index_A " + "| chart avg(offset) as aoffset" + "| chart values(aoffset) as voffset" + + "| chart sum(voffset) as soffset" + "| where soffset > 3", + testFile, ds -> { + assertEquals( + "[soffset]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !" + ); - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void whereTestIntegerColumnLessThan() { - streamingTestUtil.performDPLTest( - "index=index_A | where offset < 3", - testFile, - ds -> { - assertEquals("[_time, id, _raw, index, sourcetype, host, source, partition, offset]", Arrays.toString(ds.columns()), - "Batch handler dataset contained an unexpected column arrangement !"); - - assertEquals(2, ds.collectAsList().size()); - } - ); - } - - @Test - @DisabledIfSystemProperty(named="skipSparkTest", matches="true") - public void whereTestIntegerColumnLessThanAfterChart() { - streamingTestUtil.performDPLTest( - "index=index_A " + - "| chart avg(offset) as aoffset" + - "| chart values(aoffset) as voffset" + - "| chart sum(voffset) as soffset" + - "| where soffset > 3", - testFile, - ds -> { - assertEquals("[soffset]", Arrays.toString(ds.columns()), "Batch handler dataset contained an unexpected column arrangement !"); - - assertEquals(1, ds.collectAsList().size()); - } - ); - } + assertEquals(1, ds.collectAsList().size()); + } + ); + } }