-
Notifications
You must be signed in to change notification settings - Fork 0
/
c2_check_hcat_classification.py
136 lines (113 loc) · 6.1 KB
/
c2_check_hcat_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Author:
# github repository:
# 1. Loop over the regional crop classification tables and compare their HCAT names and codes with the HCAT3 mapping.
# 2. Save the inconsistent entries of each region to a separate Excel file.
# ------------------------------------------ LOAD PACKAGES ---------------------------------------------------#
import os
from os.path import dirname, abspath
import time
import pandas as pd
import geopandas as gpd
import warnings
import numpy as np
import helper_functions
# ------------------------------------------ USER VARIABLES ------------------------------------------------#
# Project root: the parent of the directory that contains this script. It becomes
# the working directory so that the relative data paths used below resolve.
WD = dirname(dirname(abspath(__file__)))
os.chdir(WD)
# Folders are relative to WD. They are assembled with os.path.join so that the
# separators are valid on every platform (a literal backslash is only a path
# separator on Windows).
COL_NAMES_FOLDER = os.path.join("data", "tables", "column_names")
CROP_CLASSIFICATION_FOLDER = os.path.join("data", "tables", "crop_classifications")
# ------------------------------------------ DEFINE FUNCTIONS ------------------------------------------------#
def _resolve_crop_class_pth(region_settings, region_id):
    """Return the path of the crop classification workbook of one region.

    Args:
        region_settings: The settings dictionary of the region. An optional
            "crop_class_pth" entry overrides the default file name.
        region_id: Identifier used to build the default file name.

    Returns:
        The workbook path, relative to the working directory.
    """
    custom_pth = region_settings.get("crop_class_pth")
    if custom_pth is None:
        return os.path.join(CROP_CLASSIFICATION_FOLDER, f"{region_id}_crop_classification_final.xlsx")
    # An entry that already carries a directory part (as the Spanish entries do)
    # is used as given; prefixing the folder a second time would produce a
    # doubled, non-existent path.
    if os.path.dirname(custom_pth):
        return custom_pth
    return os.path.join(CROP_CLASSIFICATION_FOLDER, custom_pth)


def _select_incomplete_error_rows(hcat_errors):
    """Return the error-table rows that lack an EC_hcat_c or an EC_hcat_n value.

    The crop-specific columns of these rows are set to NaN and the columns are
    brought into the order used for the output table.
    """
    missing_code_or_name = hcat_errors["EC_hcat_c"].isna() | hcat_errors["EC_hcat_n"].isna()
    incomplete_rows = hcat_errors.loc[missing_code_or_name].copy()
    for col in ("crop_code", "crop_name", "crop_name_de", "crop_name_en", "EC_trans_n"):
        incomplete_rows[col] = np.nan
    return incomplete_rows[['crop_code', 'crop_name', 'crop_name_de', 'crop_name_en', 'EC_trans_n',
                            'EC_hcat_n', 'EC_hcat_c', 'HCAT3_name', 'HCAT3_code', 'comment_on_mistake']]


def _find_wrong_entries(crop_class, hcat_lookup, incomplete_rows):
    """Collect the rows of a crop classification that disagree with the HCAT3 mapping.

    Args:
        crop_class: Crop classification table with EC_hcat_c and EC_hcat_n columns.
        hcat_lookup: HCAT3 mapping reduced to the HCAT3_name and HCAT3_code columns.
        incomplete_rows: Rows from the error table that are put in front of the
            mismatches found here.

    Returns:
        A data frame of de-duplicated entries, sorted by crop_name, each with a
        comment_on_mistake.
    """
    # Look up the HCAT3 name that belongs to the assigned code and keep rows whose name differs
    by_code = pd.merge(crop_class, hcat_lookup, how="left", left_on="EC_hcat_c", right_on="HCAT3_code")
    name_mismatch = by_code.loc[by_code["EC_hcat_n"] != by_code["HCAT3_name"]].copy()
    name_mismatch["comment_on_mistake"] = "mismatch in names (check if there is another fitting class or if the class name has simply changed, there is also that genetically_modified_organism and energy_crops are swapped in our version"

    # Look up the HCAT3 code that belongs to the assigned name and keep rows whose code differs
    by_name = pd.merge(crop_class, hcat_lookup, how="left", left_on="EC_hcat_n", right_on="HCAT3_name")
    code_mismatch = by_name.loc[by_name["EC_hcat_c"] != by_name["HCAT3_code"]].copy()
    code_mismatch["comment_on_mistake"] = "mismatch in codes, this likely means that the assigned code is not correct anymore"

    # pd.concat builds a new frame, so the shared incomplete_rows stay unchanged between regions
    df_out = pd.concat([incomplete_rows, name_mismatch, code_mismatch])
    df_out.drop_duplicates(subset=["crop_code", "crop_name", "EC_hcat_n", "EC_hcat_c", "HCAT3_name"], inplace=True)
    # No HCAT3 code could be attached at all: overwrite the comment for these rows
    df_out.loc[df_out["HCAT3_code"].isna(), "comment_on_mistake"] = "class does not exist anymore in HCAT3 (maybe summer was reclassified to spring?)"
    df_out.sort_values(by="crop_name", inplace=True)
    return df_out


def main():
    """Check the HCAT assignments of the regional crop classification tables.

    For every region in the run dictionary the crop classification workbook is
    merged with the HCAT3 mapping, once by code and once by name. Rows whose
    name or code disagrees with the mapping are written, together with the
    incomplete rows of the HCAT error workbook, to
    "<region_id>_crop_classification_wrong_entries.xlsx" in the crop
    classification folder. Start and end time are printed.
    """
    stime = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    print("start: " + stime)

    os.chdir(WD)

    ## Regions to check.
    ## To turn off/on the processing of a specific country, just comment/uncomment the specific line
    run_dict = {
        "BE/FLA": {"region_id": "BE_FLA"},
        "AT": {"region_id": "AT", "crop_class_pth": "AT_crop_classification_final_INSPIRE.xlsx"},
        "DK": {"region_id": "DK"},
        "SI": {"region_id": "SI"},
        "NL": {"region_id": "NL"},
        "FI": {"region_id": "FI"},
        "LV": {"region_id": "LV"},
        "SK": {"region_id": "SK"},
        "FR/FR": {"region_id": "FR_FR"},
        "FR/SUBREGIONS": {"region_id": "FR_SUBREGIONS"},
        "PT/PT": {"region_id": "PT_PT"},
        "PT/ALE": {"region_id": "PT_ALE"},
        "PT/ALG": {"region_id": "PT_ALG"},
        "PT/AML": {"region_id": "PT_AML"},
        "PT/CE": {"region_id": "PT_CE"},
        "PT/CEN": {"region_id": "PT_CEN"},
        "PT/CES": {"region_id": "PT_CES"},
        "PT/NO": {"region_id": "PT_NO"},
        "PT/NON": {"region_id": "PT_NON"},
        "PT/NOS": {"region_id": "PT_NOS"},
        "HR": {"region_id": "HR"},
        "SE": {"region_id": "SE"},
        "BE/WAL": {"region_id": "BE_WAL"},
        "DE/BB": {"region_id": "DE_BB"},
        "CZ": {"region_id": "CZ"},
        "RO": {"region_id": "RO"},
        "DE/ST": {"region_id": "DE_ST"},
        "DE/SL": {"region_id": "DE_SL"},
        "CY/APPL": {"region_id": "CY_APPL"},
        "ES": {"region_id": "ES"},
    }

    ## For spain create the entries in a loop, because of the many subregions.
    ## They are added to the regions above instead of replacing them, so that
    ## the list above stays in effect.
    ## NOTE(review): all districts point to the same ES workbook, so their
    ## outputs differ only in the file name - confirm that this is wanted.
    ES_districts = pd.read_csv(os.path.join("data", "vector", "IACS", "ES", "region_code.txt"))
    ES_districts = list(ES_districts["code"])
    run_dict.update({
        f"ES/{district}": {
            "region_id": f"ES_{district}",
            "file_encoding": "utf-8",
            "col_translate_pth": "data/tables/column_name_translations/ES_column_name_translation.xlsx",
            "crop_class_pth": "data/tables/crop_classifications/ES_crop_classification_final.xlsx",
            "col_transl_descr_overwrite": "ES",
        }
        for district in ES_districts
    })

    ## The HCAT3 mapping and the error workbook are identical for all regions,
    ## therefore they are read once before the loop.
    hcat_correct_pth = os.path.join("data", "tables", "hcat_levels_v2", "HCAT3_HCAT_mapping.csv")
    hcat_errors_pth = os.path.join("data", "tables", "hcat_levels_v2", "hcat_errors_by_kristoffer.xlsx")
    # NOTE(review): the dtype is declared for "EC_hcat_c", while only HCAT3_name and
    # HCAT3_code of this table are used below - confirm the intended column.
    hcat_correct = pd.read_csv(hcat_correct_pth, dtype={"EC_hcat_c": str})
    hcat_lookup = hcat_correct[["HCAT3_name", "HCAT3_code"]]
    incomplete_rows = _select_incomplete_error_rows(pd.read_excel(hcat_errors_pth))

    ## Loop over the regions in dict for processing
    for region_settings in run_dict.values():
        region_id = region_settings["region_id"]
        crop_class = pd.read_excel(_resolve_crop_class_pth(region_settings, region_id))

        df_out = _find_wrong_entries(crop_class, hcat_lookup, incomplete_rows)

        out_pth = os.path.join(CROP_CLASSIFICATION_FOLDER, f"{region_id}_crop_classification_wrong_entries.xlsx")
        df_out.to_excel(out_pth, index=False)

    etime = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    print("start: " + stime)
    print("end: " + etime)
# POSTGRESQL Database
# Start the check only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()