from pandas import read_csv
import numpy as np
import pandas as pd
# Optional pandas display settings, useful when inspecting the frame by hand:
# pd.set_option('display.max_columns', None)  # show every column
# pd.set_option('display.max_rows', None)     # show every row
# pd.set_option('max_colwidth', 100)          # cell display width 100 (default 50)

filename = "/root/data1.csv"

# The first CSV column becomes the index, so it is not considered by the
# "all values missing" check below.
data = read_csv(filename, index_col=0, encoding='UTF-8')

# Drop rows in which every value is missing.
data = data.dropna(axis=0, how='all')

# Cells holding the '—' placeholder are treated as missing values.
data = data.replace('—', np.nan)

# Move the former index back into the frame as column 0 and give the rows a
# fresh 0..n-1 index.
data = data.reset_index()
class Counter:
    """Shared tally used by the cleaning passes in this script.

    ``count`` is a class attribute: it is read and updated directly as
    ``Counter.count`` and no instance is ever created.
    """

    count: int = 0
def fill_nans(df, col_index):
    """Fill runs of one or two consecutive NaNs in a single column.

    Rows are addressed by position, so the frame does not need a default
    ``RangeIndex``.

    Only runs that *start* between the second row and the second-to-last
    row (inclusive) are examined; the first and last rows are never the
    start of a run.

    * A single NaN is replaced by the value of the row above it.
    * For a pair of NaNs, the first takes the value of the row above and
      the second takes the value of the row below the pair. If the pair
      ends on the last row there is no row below, and 0 is used instead.

    ``Counter.count`` is increased by the number of cells written.

    Args:
        df: Frame to repair. The chosen column is overwritten in place.
        col_index: Positional index of the column to repair.

    Returns:
        The same ``df`` object, with the repaired column.

    Raises:
        ValueError: If a run of more than two consecutive NaNs is found.
            ``df`` is left untouched in that case, because the column is
            only written back after the whole scan succeeds.
    """
    col = df.columns[col_index]
    col_data = df[col].copy()
    n_rows = len(col_data)

    for i in range(1, n_rows - 1):
        if not pd.isnull(col_data.iloc[i]):
            continue

        # Measure how far the NaN run starting at row i extends.
        run_end = i + 1
        while run_end < n_rows and pd.isnull(col_data.iloc[run_end]):
            run_end += 1
        nans = run_end - i

        if nans > 2:
            raise ValueError(f"超过两个连续NaN至第{i + nans}行")

        # i >= 1 here, so a previous row always exists.
        col_data.iloc[i] = col_data.iloc[i - 1]
        if nans == 2:
            # Use the row after the pair, or 0 when the pair ends the column.
            following = col_data.iloc[i + 2] if i + 2 < n_rows else 0
            col_data.iloc[i + 1] = following
        Counter.count += nans

    df[col] = col_data
    return df
def drop_consecutive_nans(df, col_idx, threshold=3):
    """Drop every row belonging to a long run of NaNs in one column.

    A run is a maximal stretch of consecutive NaN values in the chosen
    column. Runs of at least ``threshold`` rows are removed, including a
    run that reaches the last row. ``Counter.count`` is increased by the
    total number of rows removed.

    Rows are located by position and then dropped through their index
    labels, so the frame does not need a default ``RangeIndex``.
    NOTE(review): dropping by label assumes the index labels are unique.

    Args:
        df: Frame to clean. The rows are also removed from this object in
            place, as before.
        col_idx: Positional index of the column to inspect.
        threshold: Minimum run length that triggers removal.

    Returns:
        A frame without the removed rows and with a fresh 0..n-1 index.
    """
    col_name = df.columns[col_idx]
    missing = df[col_name].isna().to_numpy()
    n_rows = len(missing)

    doomed = np.zeros(n_rows, dtype=bool)
    run_start = None
    # Iterating one step past the end closes a run that touches the last row.
    for pos in range(n_rows + 1):
        if pos < n_rows and missing[pos]:
            if run_start is None:
                run_start = pos
            continue
        if run_start is not None:
            if pos - run_start >= threshold:
                doomed[run_start:pos] = True
            run_start = None

    Counter.count += int(doomed.sum())
    df.drop(index=df.index[doomed], inplace=True)
    return df.reset_index(drop=True)
def process_col(df, col_idx):
    """Zero out negative values that sit directly between two zeros.

    The column is converted to float for the comparison. A row is changed
    when its own value is negative and both the row above and the row
    below are exactly 0; the first and last rows are therefore never
    changed. The checks use the values as they were before any write, and
    ``Counter.count`` is increased by the number of cells set to 0.

    Rows are addressed by position, so the frame does not need a default
    ``RangeIndex``.

    Args:
        df: Frame to correct; it is modified in place.
        col_idx: Positional index of the column to correct.

    Returns:
        The same ``df`` object.

    Raises:
        ValueError: If the column cannot be converted to float.
    """
    values = df.iloc[:, col_idx].astype(float).to_numpy()
    if len(values) < 3:
        # No row has both an upper and a lower neighbour.
        return df

    # Compare every interior row with the rows directly above and below it.
    isolated = (values[1:-1] < 0) & (values[:-2] == 0) & (values[2:] == 0)
    rows = np.flatnonzero(isolated) + 1  # +1: the mask starts at row 1

    if rows.size:
        Counter.count += int(rows.size)
        df.iloc[rows, col_idx] = 0
    return df
# Pass 1: zero out negative values sandwiched between zeros.
# Columns 1..12 are visited; columns 0 and 7 are skipped.
for i in range(1, 13):
    if i != 7:
        data = process_col(data, i)
print('找到了:', Counter.count, '个数据可能时测量0值时出现错误,并补0')
Counter.count = 0

# Pass 2: drop rows that belong to long NaN runs, checking columns 0..13.
for i in range(14):
    data = drop_consecutive_nans(data, i)
print('找到了:', Counter.count, '行数据缺失,并删除')
Counter.count = 0

# Pass 3: fill the remaining short NaN runs in columns 2..13.
for i in range(2, 14):
    data = fill_nans(data, i)
print('补全了:', Counter.count, '个数据')
Counter.count = 0

# Show the cleaned frame, then write it out together with its row index.
print('now:')
print(data)

outputpath = '/root/res.csv'
data.to_csv(
    outputpath,
    sep=',',
    index=True,
    header=True,
    encoding='utf-8-sig',
)