Data filling using neighbor statistics in Python

Dr. Huidae Cho

1   USGS discharge data

USGS discharge data

usgs_discharge_data.txt

2   Python code

import random

# read in data
data_path = r"C:\tmp\usgs_discharge_data.txt"
f = open(data_path)
skip_lines = 2
line_count = 0
dischs = []
for line in f:
    # if line starts with a hash, skip this line
    if line.startswith("#"):
        continue
    
    line_count = line_count + 1
    if line_count <= skip_lines:
        continue
    cols = line.split("\t")
    disch = float(cols[4])
    dischs.append(disch)
f.close()

# error specifications
specs = [{"flow_rate": 300, "error_rate": 1}, {"flow_rate": 800, "error_rate": 10}, {"flow_rate": 0, "error_rate": 15}]
specs2 = {"flow_rates": [300, 800, 0], "error_rates": [1, 10, 15]}

# generate random errors
num_days = len(dischs)
flow_rates = specs2["flow_rates"]
error_rates = specs2["error_rates"]
num_specs = len(flow_rates)

for i in range(num_days):
    disch = dischs[i] # take the discharge for day i
    for j in range(num_specs):
        error_rate = error_rates[j]

        if j == 0:
            lower_flow = -100 # no lower limit
            upper_flow = flow_rates[j]
        elif j == num_specs - 1:
            lower_flow = flow_rates[j-1]
            upper_flow = -100 # meaning no upper limit
        else:
            lower_flow = flow_rates[j-1]
            upper_flow = flow_rates[j]

        prob = random.random() * 100
        # prob: 0 ........... 100
        # rate:    1
        
        # if (disch is less than 300 cfs) or (disch is greater than 800 cfs) or (disch is between 300 cfs and 800 cfs) ==> [0, infinity)
        if (lower_flow < 0 and disch <= upper_flow) or (upper_flow < 0 and disch > lower_flow) or (lower_flow < disch <= upper_flow):
            if prob <= error_rate:
                # introduce an error
                dischs[i] = -100
            break            

dischs_errors = dischs.copy()

# fill gaps using the average of previous and next days
for i in range(num_days):
    disch = dischs[i] # take the discharge for day i
    if disch < 0: # we found an error
        # 1. take the previous and next days' discharges
        disch_prev = -100 # -100 will indicate that we haven't found the previous day's discharge
        disch_next = -100 # -100 will indicate that we haven't found the next day's discharge
        if i > 0:
            disch_prev = dischs[i-1]
        # find the first coming day without an error
        for j in range(i + 1, num_days):
            if dischs[j] >= 0:
                disch_next = dischs[j]
                break
        if disch_prev < 0: # if there was no previous discharge found ==> day 0
            if disch_next < 0: # if there was no next discharge found ==> all the days after day i had an error
                # case 1: -100, -100, ..., -100
                # all the days in your data have an error, we cannot do anything because everything is an error
                # stop the entire process
                break
            else: # we found a next discharge
                # case 2: -100, -100, 50, ...
                # just use 50 to fill the gaps because there is no lower bound
                # fixed: 50, 50, 50, ...
                dischs[i] = disch_next
        else: # there was a previous discharge
            if disch_next < 0: # if there was no next discharge found ==> all the days after day i had an error
                # case 3: 50, -100, -100, ..., -100
                # fixed: 50, 50, 50, ..., 50
                dischs[i] = disch_prev
            else: # have both previous and next discharges
                #              i=1        j=3
                # case 4: 50, -100, -100, 100, ...
                # fixed: 50, 50 + (100 - 50) / 3 = 66.17, 82.84, 100, ...
                dischs[i] = disch_prev + (disch_next - disch_prev) / (j - i + 1)