Panda Bears and Pythons

I’ve seen import pandas and import numpy around the traps and thought ‘geez, that’ll be a long install’. Today I saw why it’s good.

Some things I learnt:

  • Python on Windows is crap. Try conda or Linux
  • Read the excellent docs
  • Create test data to understand your problem
  • Review the read_csv() result to see if Pandas got the timestamp parsed correctly. Otherwise, if it thinks it’s a string or object, you can’t work with time-series data.
  • Set the index with set_index()
  • Once in a DataFrame, resampling and merging is easy!

# import data with weird date formats and timestamp from many csv files.py
import pandas as pd
import os

# Resampling interval: passed to DataFrame.resample() and embedded in the
# names of the output CSV files.
# NOTE(review): this name shadows the builtin bin(); renaming it would mean
# touching every function below that reads it.
bin = "1h"

def weird_dates(date_str):
    """Parse a day-first timestamp written in 24-hour or 12-hour AM/PM style.

    The formats are tried in order and the first successful parse wins.

    Args:
        date_str: Value handed to ``pd.to_datetime``.

    Returns:
        Whatever ``pd.to_datetime`` produces for the first matching format.

    Raises:
        ValueError: If neither format matches.
    """
    candidate_formats = (
        "%d/%m/%Y %H:%M",     # 24-hour clock
        "%d/%m/%Y %I:%M %p",  # 12-hour clock with AM/PM marker
    )
    for candidate in candidate_formats:
        try:
            parsed = pd.to_datetime(date_str, format=candidate)
        except ValueError:
            # This format did not fit; move on to the next one.
            continue
        else:
            return parsed
    raise ValueError("no valid date format found")

# Function to read CSV files and collate data
def collate_data(csv_files):
    """Read, resample and outer-merge a collection of CSV files.

    Each file is read with its first two rows skipped. Its ``Timestamp``
    column is parsed with ``weird_dates`` and made the index, then the data
    is resampled to the module-level ``bin`` interval (mean of each bin,
    rounded to one decimal place). The per-file results are outer-merged on
    ``Timestamp``.

    A file that raises at any stage is reported and skipped, so one bad file
    does not abort the whole run.

    Args:
        csv_files: Iterable of paths (or anything ``pd.read_csv`` accepts).

    Returns:
        The merged DataFrame sorted by ``Timestamp``, or an empty DataFrame
        when no file could be processed.
    """
    collated_data = None
    failures = []

    # Iterate through each CSV file
    for file in csv_files:
        try:
            print(f"Reading {file}")
            # read_csv's date_parser argument is deprecated since pandas 2.0,
            # so the timestamps are parsed explicitly after loading instead.
            df = pd.read_csv(file, skiprows=2)
            try:
                # Fast path: the whole column shares one format.
                df['Timestamp'] = weird_dates(df['Timestamp'])
            except ValueError:
                # Formats are mixed within the file: parse value by value.
                df['Timestamp'] = df['Timestamp'].map(weird_dates)
            df = df.set_index('Timestamp')
            df.info()
            print(f"Resampling to {bin}")
            pretty_df = df.resample(bin).mean()
            pretty_df = pretty_df.round(decimals=1)
            pretty_df.info()
            if collated_data is None:
                collated_data = pretty_df
            else:
                collated_data = pd.merge(collated_data, pretty_df, on='Timestamp', how='outer')
        except Exception as exc:
            # Deliberate best-effort: report the problem and keep going.
            print(exc)
            failures.append(file)

    print(f"Yeah, so these failed: {failures}")
    if collated_data is None:
        # No input, or every file failed: nothing to sort, so hand back an
        # empty frame rather than crashing on None.
        return pd.DataFrame()
    return collated_data.sort_values(by="Timestamp")

# Separate into csv files by column name
def to_csv_selected(df,search_strings):
    """Write one CSV per search string holding the columns that match it.

    A column matches when the search string occurs anywhere in its name,
    ignoring case. Each output file is named ``<search_string>_<bin>.csv``.

    Args:
        df: DataFrame whose columns are filtered.
        search_strings: Iterable of substrings to look for in column names.
    """
    for search_string in search_strings:
        needle = search_string.lower()
        matching_columns = []
        for column in df.columns:
            if needle in column.lower():
                matching_columns.append(column)
        print(f"{search_string} matched {matching_columns}")
        subset = df[matching_columns]
        subset.to_csv(f"{search_string}_{bin}.csv")


def main():
    """Collate every CSV in the ``alldata`` directory and write the results.

    Writes ``all_data_<bin>.csv`` containing everything, then one CSV per
    keyword ("Temperature", "Humidity", "Dew") with the matching columns.
    """
    # Directory containing CSV files
    csv_directory = 'alldata'

    # os.listdir() returns bare file names, so join each one back onto the
    # directory to get a readable path. Sorting keeps the merge order (and so
    # the output column order) the same from run to run.
    csv_files = sorted(
        os.path.join(csv_directory, name)
        for name in os.listdir(csv_directory)
        if name.endswith('.csv')
    )

    collated_data = collate_data(csv_files)
    collated_data.to_csv(f"all_data_{bin}.csv")

    to_csv_selected(collated_data, ["Temperature", "Humidity", "Dew"])

    
# Run the collation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

Leave a Reply

Your email address will not be published. Required fields are marked *