Data filling using neighbor statistics in Python
Contents
1 USGS discharge data
2 Python code
import random
# read in data
data_path = r"C:\tmp\usgs_discharge_data.txt"
f = open(data_path)
skip_lines = 2
line_count = 0
dischs = []
for line in f:
# if line starts with a hash, skip this line
if line.startswith("#"):
continue
line_count = line_count + 1
if line_count <= skip_lines:
continue
cols = line.split("\t")
disch = float(cols[4])
dischs.append(disch)
f.close()
# error specifications
specs = [{"flow_rate": 300, "error_rate": 1}, {"flow_rate": 800, "error_rate": 10}, {"flow_rate": 0, "error_rate": 15}]
specs2 = {"flow_rates": [300, 800, 0], "error_rates": [1, 10, 15]}
# generate random errors
num_days = len(dischs)
flow_rates = specs2["flow_rates"]
error_rates = specs2["error_rates"]
num_specs = len(flow_rates)
for i in range(num_days):
disch = dischs[i] # take the discharge for day i
for j in range(num_specs):
error_rate = error_rates[j]
if j == 0:
lower_flow = -100 # no lower limit
upper_flow = flow_rates[j]
elif j == num_specs - 1:
lower_flow = flow_rates[j-1]
upper_flow = -100 # meaning no upper limit
else:
lower_flow = flow_rates[j-1]
upper_flow = flow_rates[j]
prob = random.random() * 100
# prob: 0 ........... 100
# rate: 1
# if (disch is less than 300 cfs) or (disch is greater than 800 cfs) or (disch is between 300 cfs and 800 cfs) ==> [0, infinity)
if (lower_flow < 0 and disch <= upper_flow) or (upper_flow < 0 and disch > lower_flow) or (lower_flow < disch <= upper_flow):
if prob <= error_rate:
# introduce an error
dischs[i] = -100
break
dischs_errors = dischs.copy()
# fill gaps using the average of previous and next days
for i in range(num_days):
disch = dischs[i] # take the discharge for day i
if disch < 0: # we found an error
# 1. take the previous and next days' discharges
disch_prev = -100 # -100 will indicate that we haven't found the previous day's discharge
disch_next = -100 # -100 will indicate that we haven't found the next day's discharge
if i > 0:
disch_prev = dischs[i-1]
# find the first coming day without an error
for j in range(i + 1, num_days):
if dischs[j] >= 0:
disch_next = dischs[j]
break
if disch_prev < 0: # if there was no previous discharge found ==> day 0
if disch_next < 0: # if there was no next discharge found ==> all the days after day i had an error
# case 1: -100, -100, ..., -100
# all the days in your data have an error, we cannot do anything because everything is an error
# stop the entire process
break
else: # we found a next discharge
# case 2: -100, -100, 50, ...
# just use 50 to fill the gaps because there is no lower bound
# fixed: 50, 50, 50, ...
dischs[i] = disch_next
else: # there was a previous discharge
if disch_next < 0: # if there was no next discharge found ==> all the days after day i had an error
# case 3: 50, -100, -100, ..., -100
# fixed: 50, 50, 50, ..., 50
dischs[i] = disch_prev
else: # have both previous and next discharges
# i=1 j=3
# case 4: 50, -100, -100, 100, ...
# fixed: 50, 50 + (100 - 50) / 3 = 66.17, 82.84, 100, ...
dischs[i] = disch_prev + (disch_next - disch_prev) / (j - i + 1)