我想清理我的csv文件中的某个列,当我将它们传递到pandas中时,它会为每行留下一个尾随\r。54147799,xxx_54147799,xxx_1442517,ALARM_NORMALIZED,xxx,xxx,2023-01-01 00:12:05.168,null,1,NORMAL,xxx/xxx/ATI,xxx,3,ATI System Mode,2,4,-1,-1\r
请帮助我改进我的脚本,使\r不出现。
我的csv文件实际上无法解析进Elasticsearch,原因就是这个\r:第18列预计只接收纯数字,而不是带字母的混合值(alphanumeric)。
import pandas as pd
from pathlib import Path
import csv
import os
import re
def format_location(location):
    """Normalize a numeric location code.

    A purely numeric 3-digit code is wrapped as ``9<code>2`` (yielding a
    5-digit code); every other value — 5-digit codes, other lengths,
    non-numeric strings, empty strings, and non-strings — passes through
    unchanged.
    """
    # Only non-empty strings made up solely of digits are candidates.
    if not location or not isinstance(location, str):
        return location
    if re.match(r'^\d+$', location) and len(location) == 3:
        # Prepend '9' and append '2' to pad a 3-digit code out to 5 digits.
        return "9" + location + "2"
    # 5-digit codes and everything else are already in their final form.
    return location
def drop_extra_column(input_directory, intermediate_directory):
# Check all files in the input directory
for file in input_directory.iterdir():
if file.suffix == '.csv':
try:
# Read CSV file using Pandas with error handling to warn and skip problematic lines
with open(file, 'r') as f: pd.read_csv(f)
df = pd.read_csv(f)
# Check if DataFrame has more than 18 columns
if len(df.columns) > 18:
# Drop the 19th column (index 18)
df.drop(df.columns[18], axis=1, inplace=True)
# Define the output file path
intermediate_path = intermediate_directory / f"{file.name}"
# Write back to the CSV file in the output directory using Pandas
df.to_csv(intermediate_path, index=False)
print(f"Processed data saved to '{intermediate_path}'")
else:
print(f"No extra column found in '{file.name}'. Skipping this file.")
except Exception as e:
print(f"Error processing file '{file.name}': {e}. Skipping this file.")
continue
def process_csv_files(intermediate_directory, output_directory):
    """Normalize the ``location`` column of every CSV in *intermediate_directory*.

    Each ``.csv`` file is read with pandas, stray carriage returns from
    Windows ``\\r\\n`` line endings are scrubbed out of string cells, numeric
    ``location`` values are reformatted via :func:`format_location`, and the
    result is written to *output_directory* with plain ``\\n`` line endings so
    no trailing ``\\r`` pollutes the data for Logstash/Elasticsearch ingestion.

    Files that fail to parse are skipped with a message.
    """
    for file in intermediate_directory.iterdir():
        if file.suffix != '.csv':
            continue
        try:
            # Pass the path directly; the previous open()/read_csv dance read
            # from an already-closed handle and failed on every file.
            df = pd.read_csv(file)
            # Remove carriage returns that survive inside string cells.
            df = df.replace(r'\r', '', regex=True)
            # Apply formatting to 'location' only when the column exists.
            if 'location' in df.columns:
                df['location'] = df['location'].apply(format_location)
            output_path = output_directory / file.name
            # lineterminator='\n' guarantees Unix newlines in the output.
            df.to_csv(output_path, index=False, lineterminator='\n')
            print(f"Formatted data saved to '{output_path}'")
        except Exception as e:
            print(f"Error processing file '{file.name}': {e}. Skipping this file.")
            continue
# Directory layout: raw daily exports, an intermediate scratch area for the
# column-drop pass, and the final cleaned output. Adjust these paths to your
# environment before running.
input_csv_directory = Path("C:/Users/ethan/OneDrive/Documents/docker-env/logstash-standard/xxx/2023/01-Jan-23")
intermediate_csv_directory = Path("C:/Users/ethan/OneDrive/Documents/docker-env/logstash-standard/xxx/2023/Test")
output_csv_directory = Path("C:/Users/ethan/OneDrive/Documents/docker-env/logstash-standard/xxx/2023/Test")
# First pass (currently disabled): strip the extra 19th column.
#drop_extra_column(input_csv_directory, intermediate_csv_directory)
# Second pass: normalize the 'location' column and write the cleaned files.
process_csv_files(input_csv_directory, output_csv_directory)
(24/11/23 更新:已按建议改用 with open('my.csv', 'r') as f: pd.read_csv(f) 的写法,但仍然报错:)
Error processing file 'events2023-01-24.csv': Error tokenizing data. C error: Expected 18 fields in line 6370, saw 19 . Skipping this file.
我的csv的第19列很少有数据,但显然100 k+行中的一些行仍然有数据。我不知道该怎么做,这个问题能解决吗?
1条答案
按热度按时间qf9go6mv1#
我最终是从Elastic的角度、而不是Python的角度解决了这个问题。
做法是在 logstash.conf 的 filter 段里使用 mutate 过滤器(例如 mutate { gsub => ["message", "\r", ""] })来捕获 \r,
并在json消息被摄取到管道中以作为Elasticsearch Index条目进行进一步剖析之前将其删除。对于那些使用Elasticsearch的人,如果你使用python,pandas和它们的读/写函数来清理数据,并获得不必要的回车,比如\r或\n,这会污染你的数据以便logstash摄取,那么你可能想学习我的解决方案。