I’ve seen `import pandas` and `import numpy`
around the traps and thought ‘geez, that’ll be a long install’. Today I saw why it’s good.
Some things I learnt:
- Python on Windows is crap. Try conda or Linux
- Read the excellent docs
- Create test data to understand your problem
- Review the `read_csv()` result to see if Pandas got the timestamp parsed correctly. Otherwise, if it thinks it’s a string or object, you can’t work with time-series data.
- Set the index with `set_index()`
- Once in a DataFrame, resampling and merging is easy!
# import data with weird date formats and timestamp from many csv files.py
import pandas as pd
import os
# Resampling interval: passed to DataFrame.resample() in collate_data() and
# embedded in the names of the CSV files written by main()/to_csv_selected().
# NOTE(review): this name shadows the built-in bin(); renaming it means
# updating every use in this file.
bin = "1h"
def weird_dates(date_str):
    """Parse a day-first timestamp written in 24-hour or 12-hour style.

    The formats are tried in order and the first one that matches wins:
    ``%d/%m/%Y %H:%M`` (e.g. ``25/12/2023 14:30``), then
    ``%d/%m/%Y %I:%M %p`` (e.g. ``25/12/2023 02:30 PM``).

    Args:
        date_str: The value to parse. Plain strings have surrounding
            whitespace removed first; anything else is handed to
            ``pd.to_datetime`` unchanged.

    Returns:
        Whatever ``pd.to_datetime`` returns for the first matching format.

    Raises:
        ValueError: If none of the formats match. The message names the
            rejected value so the offending row can be located.
    """
    if isinstance(date_str, str):
        # Padding around a CSV field may otherwise fail the exact format match.
        date_str = date_str.strip()
    for fmt in ("%d/%m/%Y %H:%M", "%d/%m/%Y %I:%M %p"):  # %I for 12-hour clock, %p for AM/PM
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            # Not this format; fall through to the next candidate.
            continue
    raise ValueError(f"no valid date format found for {date_str!r}")
def collate_data(csv_files):
    """Read, resample and merge several CSV exports into one DataFrame.

    Each file is read with its first two lines skipped, its ``Timestamp``
    column is parsed with ``weird_dates`` and made the index, and the data is
    averaged into ``bin``-sized intervals rounded to one decimal place. The
    per-file results are outer-merged on ``Timestamp``.

    A file that cannot be processed is reported and skipped (best effort)
    instead of aborting the whole run.

    Args:
        csv_files: Iterable of CSV file paths.

    Returns:
        The merged data sorted by ``Timestamp``, or an empty DataFrame when
        no file could be processed (including an empty ``csv_files``).
    """
    collated_data = None
    failures = []
    # Iterate through each CSV file
    for file in csv_files:
        try:
            print(f"Reading {file}")
            # Parse the timestamps explicitly instead of through read_csv's
            # `date_parser` argument, which pandas deprecated in 2.0.
            df = pd.read_csv(file, skiprows=2)
            df['Timestamp'] = pd.to_datetime(df['Timestamp'].map(weird_dates))
            df = df.set_index('Timestamp')
            df.info()
            print(f"Resampling to {bin}")
            pretty_df = df.resample(bin).mean()
            pretty_df = pretty_df.round(decimals=1)
            pretty_df.info()
            if collated_data is None:
                collated_data = pretty_df
            else:
                collated_data = pd.merge(collated_data, pretty_df, on='Timestamp', how='outer')
        except Exception as exc:
            # Deliberately broad: one unreadable or malformed file must not
            # stop the remaining files from being collated.
            print(exc)
            failures.append(file)
    print(f"Yeah, so these failed: {failures}")
    if collated_data is None:
        # Nothing was read successfully; hand back an empty frame so callers
        # can still call DataFrame methods on the result.
        return pd.DataFrame()
    return collated_data.sort_values(by="Timestamp")
# Separate into csv files by column name
def to_csv_selected(df, search_strings):
    """Write one CSV per search string containing the matching columns.

    A column matches when the search string occurs anywhere in its name,
    ignoring case. Each subset is written to ``<search_string>_<bin>.csv``;
    the matched column names are printed first.

    Args:
        df: DataFrame whose columns are filtered.
        search_strings: Iterable of substrings to look for in column names.
    """
    for search_string in search_strings:
        matching_columns = []
        for column in df.columns:
            if search_string.lower() not in column.lower():
                continue
            matching_columns.append(column)
        print(f"{search_string} matched {matching_columns}")
        subset = df[matching_columns]
        subset.to_csv(f"{search_string}_{bin}.csv")
def main():
    """Collate every CSV in the ``alldata`` directory and write the results.

    The merged data goes to ``all_data_<bin>.csv``; ``to_csv_selected`` then
    writes one file each for the Temperature, Humidity and Dew columns.
    """
    # Directory containing CSV files
    csv_directory = 'alldata'
    # os.listdir() yields bare file names, so join each one onto the directory
    # to get a path that opens regardless of the current working directory.
    # Sorting makes the processing order reproducible.
    csv_files = [
        os.path.join(csv_directory, name)
        for name in sorted(os.listdir(csv_directory))
        if name.endswith('.csv')
    ]
    collated_data = collate_data(csv_files)
    collated_data.to_csv(f"all_data_{bin}.csv")
    to_csv_selected(collated_data, ["Temperature", "Humidity", "Dew"])
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()