Skip to content

Commit

Permalink
Merge pull request #80 from DougSchmidt-AI/feature/Issue-72-PointZill…
Browse files Browse the repository at this point in the history
…aRemoveDuplicateCsvPoints

Issue-72 - PointZilla now removes duplicate points from CSV files by default
  • Loading branch information
Doug Schmidt authored Sep 28, 2018
2 parents 602d982 + bdc58cd commit f64b758
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 14 deletions.
1 change: 1 addition & 0 deletions TimeSeries/PublicApis/SdkExamples/PointZilla/Context.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,6 @@ public class Context
public string CsvTimeFormat { get; set; }
public bool CsvIgnoreInvalidRows { get; set; }
public bool CsvRealign { get; set; }
public bool CsvRemoveDuplicatePoints { get; set; } = true;
}
}
30 changes: 30 additions & 0 deletions TimeSeries/PublicApis/SdkExamples/PointZilla/CsvReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,36 @@ private List<ReflectedTimeSeriesPoint> LoadPoints(string path)
points.Add(point);
}

if (Context.CsvRemoveDuplicatePoints)
{
points = points
.OrderBy(p => p.Time)
.ToList();

var duplicatePointCount = 0;

for (var i = 1; i < points.Count; ++i)
{
var prevPoint = points[i - 1];
var point = points[i];

if (point.Time != prevPoint.Time)
continue;

++duplicatePointCount;

Log.Warn($"Discarding duplicate CSV point at {point.Time} with value {point.Value}");
points.RemoveAt(i);

--i;
}

if (duplicatePointCount > 0)
{
Log.Warn($"Removed {duplicatePointCount} duplicate CSV points.");
}
}

if (Context.CsvRealign)
{
points = points
Expand Down
2 changes: 1 addition & 1 deletion TimeSeries/PublicApis/SdkExamples/PointZilla/Option.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public string UsageText()
return $"{key} {Description}{defaultValue}";
}

private const int KeyWidth = 22;
private const int KeyWidth = 24;
private static readonly string SeparatorLine = string.Empty.PadRight(KeyWidth + 1, '=');
}
}
31 changes: 22 additions & 9 deletions TimeSeries/PublicApis/SdkExamples/PointZilla/PointsAppender.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,21 +62,34 @@ public void AppendPoints()

var numberOfPointsAppended = 0;
var numberOfPointsDeleted = 0;
var batchCount = 0;
var stopwatch = Stopwatch.StartNew();

foreach (var batch in GetPointBatches())
var pointBatches = GetPointBatches(Points).ToList();
var isBatched = pointBatches.Count > 1;
var batchIndex = 1;

foreach (var batch in pointBatches)
{
var result = AppendPointBatch(client, timeSeries, batch.Item1, batch.Item2, isReflected, hasTimeRange);
if (isBatched)
{
var batchSummary =
$"Appending batch #{batchIndex}: {batch.Points.Count} points [{batch.Points.First().Time} to {batch.Points.Last().Time}]";

Log.Info( hasTimeRange
? $"{batchSummary} within TimeRange={batch.TimeRange} ..."
: $"{batchSummary} ...");
}

var result = AppendPointBatch(client, timeSeries, batch.Points, batch.TimeRange, isReflected, hasTimeRange);
numberOfPointsAppended += result.NumberOfPointsAppended;
numberOfPointsDeleted += result.NumberOfPointsDeleted;
batchCount++;
++batchIndex;

if (result.AppendStatus != AppendStatusCode.Completed)
throw new ExpectedException($"Unexpected append status={result.AppendStatus}");
}

var batchText = batchCount > 1 ? $" using {batchCount} appends" : "";
var batchText = isBatched ? $" using {pointBatches.Count} appends" : "";
Log.Info($"Appended {numberOfPointsAppended} points (deleting {numberOfPointsDeleted} points) in {stopwatch.ElapsedMilliseconds / 1000.0:F1} seconds{batchText}.");
}
}
Expand Down Expand Up @@ -131,9 +144,9 @@ private TimeSeriesAppendStatus AppendPointBatch(IAquariusClient client, TimeSeri
Context.AppendTimeout);
}

private IEnumerable<Tuple<List<ReflectedTimeSeriesPoint>, Interval>> GetPointBatches()
private IEnumerable<(List<ReflectedTimeSeriesPoint> Points, Interval TimeRange)> GetPointBatches(
List<ReflectedTimeSeriesPoint> points)
{
var points = GetPoints();
var remainingTimeRange = GetTimeRange();

var index = 0;
Expand All @@ -143,12 +156,12 @@ private IEnumerable<Tuple<List<ReflectedTimeSeriesPoint>, Interval>> GetPointBat
var batchTimeRange = new Interval(remainingTimeRange.Start, batchPoints.Last().Time.GetValueOrDefault().PlusTicks(1));
remainingTimeRange = new Interval(batchTimeRange.End, remainingTimeRange.End);

yield return new Tuple<List<ReflectedTimeSeriesPoint>, Interval>(batchPoints, batchTimeRange);
yield return (batchPoints, batchTimeRange);

index += Context.BatchSize;
}

yield return new Tuple<List<ReflectedTimeSeriesPoint>, Interval>(points.Skip(index).ToList(), remainingTimeRange);
yield return (points.Skip(index).ToList(), remainingTimeRange);
}

private List<ReflectedTimeSeriesPoint> GetPoints()
Expand Down
1 change: 1 addition & 0 deletions TimeSeries/PublicApis/SdkExamples/PointZilla/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ private static Context ParseArgs(string[] args)
new Option {Key = nameof(context.CsvSkipRows), Setter = value => context.CsvSkipRows = int.Parse(value), Getter = () => context.CsvSkipRows.ToString(), Description = "Number of CSV rows to skip before parsing"},
new Option {Key = nameof(context.CsvIgnoreInvalidRows), Setter = value => context.CsvIgnoreInvalidRows = bool.Parse(value), Getter = () => context.CsvIgnoreInvalidRows.ToString(), Description = "Ignore CSV rows that can't be parsed"},
new Option {Key = nameof(context.CsvRealign), Setter = value => context.CsvRealign = bool.Parse(value), Getter = () => context.CsvRealign.ToString(), Description = $"Realign imported CSV points to the /{nameof(context.StartTime)} value"},
new Option {Key = nameof(context.CsvRemoveDuplicatePoints), Setter = value => context.CsvRemoveDuplicatePoints = bool.Parse(value), Getter = () => context.CsvRemoveDuplicatePoints.ToString(), Description = "Remove duplicate points in the CSV before appending."},
new Option {Key = "CsvFormat", Description = "Shortcut for known CSV formats. One of 'NG' or '3X'. [default: NG]", Setter =
value =>
{
Expand Down
66 changes: 62 additions & 4 deletions TimeSeries/PublicApis/SdkExamples/PointZilla/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,6 @@ The `-csvFormat=` option supports two preconfigured formats:
- `-csvFormat=NG` is equivalent to `-csvTimeField=1 -csvValueField=3 -csvGradeField=5 -csvQualifiersField=6 -csvSkipRows=0 -csvComment="#"`
- `-csvFormat=3X` is equivalent to `-csvTimeField=1 -csvValueField=2 -csvGradeField=3 -csvQualifiersField=0 -csvSkipRows=2 -csvTimeFormat="MM/dd/yyyy HH:mm:ss"`




```sh
$ ./PointZilla.exe -server=myserver Stage.Label@MyLocation Downloads/Stage.Historical@A001002.EntireRecord.csv

Expand Down Expand Up @@ -208,5 +205,66 @@ $ ./PointZilla.exe -server=myserver Stage.Label@Location -TimeRange=2018-04-25T0

Like `curl`, the `PointZilla` tool has dozens of command line options, which can be a bit overwhelming. Fortunately, you'll rarely need to use all the options at once.

Try the `/Help` option to see the entire list of supported options.
Try the `/Help` option to see the entire list of supported options, and read the [wiki for the @optionsFile syntax](https://github.com/AquaticInformatics/examples/wiki/Common-command-line-options).

```
Append points to an AQTS time-series.
usage: PointZilla [-option=value] [@optionsFile] [command] [identifierOrGuid] [value] [csvFile] ...
Supported -option=value settings (/option=value works too):
-Server AQTS server name
-Username AQTS username [default: admin]
-Password AQTS password [default: admin]
-Wait Wait for the append request to complete [default: True]
-AppendTimeout Timeout period for append completion, in .NET TimeSpan format.
-BatchSize Maximum number of points to send in a single append request [default: 500000]
========================= Time-series options:
-TimeSeries Target time-series identifier or unique ID
-TimeRange Time-range for overwrite in ISO8061/ISO8601 (defaults to start/end points)
-Command Append operation to perform. One of Auto, Append, OverwriteAppend, Reflected, DeleteAllPoints. [default: Auto]
-GradeCode Optional grade code for all appended points
-Qualifiers Optional qualifier list for all appended points
-CreateMode Mode for creating missing time-series. One of Never, Basic, Reflected. [default: Never]
-GapTolerance Set the gap tolerance for newly-created time-series. [default: "MaxDuration"]
-UtcOffset Set the UTC offset for any created location. [default: Use system timezone]
========================= Copy points from another time-series:
-SourceTimeSeries Source time-series to copy. Prefix with [server2] or [server2:username2:password2] to copy from another server
-SourceQueryFrom Start time of extracted points in ISO8601 format.
-SourceQueryTo End time of extracted points
========================= Point-generator options:
-StartTime Start time of generated points, in ISO8601 format. [default: the current time]
-PointInterval Interval between generated points, in .NET TimeSpan format. [default: 00:01:00]
-NumberOfPoints Number of points to generate. If 0, use NumberOfPeriods [default: 0]
-NumberOfPeriods Number of waveform periods to generate. [default: 1]
-WaveformType Waveform to generate. One of Linear, SawTooth, SineWave, SquareWave. [default: SineWave]
-WaveformOffset Offset the generated waveform by this constant. [default: 0]
-WaveformPhase Phase within one waveform period [default: 0]
-WaveformScalar Scale the waveform by this amount [default: 1]
-WaveformPeriod Waveform period before repeating [default: 1440]
========================= CSV parsing options:
-CSV Parse the CSV file
-CsvTimeField CSV column index for timestamps [default: 1]
-CsvValueField CSV column index for values [default: 3]
-CsvGradeField CSV column index for grade codes [default: 5]
-CsvQualifiersField CSV column index for qualifiers [default: 6]
-CsvTimeFormat Format of CSV time fields (defaults to ISO8601)
-CsvComment CSV comment lines begin with this prefix [default: #]
-CsvSkipRows Number of CSV rows to skip before parsing [default: 0]
-CsvIgnoreInvalidRows Ignore CSV rows that can't be parsed [default: True]
-CsvRealign Realign imported CSV points to the /StartTime value [default: False]
-CsvRemoveDuplicatePoints Remove duplicate points in the CSV before appending. [default: True]
-CsvFormat Shortcut for known CSV formats. One of 'NG' or '3X'. [default: NG]
Use the @optionsFile syntax to read more options from a file.
Each line in the file is treated as a command line option.
Blank lines and leading/trailing whitespace is ignored.
Comment lines begin with a # or // marker.
```

0 comments on commit f64b758

Please sign in to comment.