-
Notifications
You must be signed in to change notification settings - Fork 0
/
transform-genbank-location
executable file
·43 lines (31 loc) · 1.25 KB
/
transform-genbank-location
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/env python3
"""
Parses the GenBank 'location' field of each NDJSON record from stdin into 3
separate fields: 'country', 'division', and 'location'. A record is treated as
coming from GenBank when its 'database' field has a value of "GenBank" or
"RefSeq"; all other records are passed through unchanged.
Outputs every record to stdout, one JSON object per line.
"""
import json
from sys import stdin, stdout
def parse_location(record: dict) -> dict:
    """
    Split the GenBank 'location' field of *record* into three fields.

    The expected pattern for the field is
    "<country_value>[:<region>][, <locality>]". See the GenBank docs for
    their "country" field:
    https://www.ncbi.nlm.nih.gov/genbank/collab/country/

    The record is modified in place and also returned:

    * 'country'  -- text before the first colon
    * 'division' -- text between the first colon and the following comma
    * 'location' -- text after that comma (overwrites the original value)

    Parts that are absent from the input become empty strings, and all
    three values are stripped of surrounding whitespace.

    Raises KeyError if *record* has no 'location' key.
    """
    # Split on the FIRST colon only. Splitting on every colon and requiring
    # exactly two pieces silently discarded the division and locality
    # whenever a later part of the value happened to contain a colon.
    country, _, region = record['location'].partition(':')

    # Within the region part, the first comma separates the division from
    # the locality; with no comma the whole region part is the division.
    division, _, location = region.partition(',')

    record['country'] = country.strip()
    record['division'] = division.strip()
    record['location'] = location.strip()

    return record
if __name__ == '__main__':
    # Records whose 'database' field is one of these values get their
    # 'location' field split; every other record passes through untouched.
    genbank_databases = {'GenBank', 'RefSeq'}

    # Each line of stdin holds one JSON record (NDJSON).
    for line in stdin:
        entry = json.loads(line)

        if entry.get('database', '') in genbank_databases:
            # parse_location updates the record in place.
            parse_location(entry)

        # Emit compact JSON (no spaces after separators), one record per line.
        json.dump(entry, stdout, allow_nan=False, indent=None, separators=(',', ':'))
        stdout.write('\n')