#!/usr/bin/env python3
# -*- coding: utf-8 -*-

#%%
# The pandas module builds on the data types introduced earlier (NumPy arrays) and lets us work with DataFrames.
import numpy as np
import pandas as pd

# Define a pandas DataFrame that includes monthly (6 months) info on sales, weather, and customers

# We start by defining a NumPy array (ndarray): sales over 6 months -> 6 rows
sales = np.array([30, 40, 35, 130, 120, 60])

# Create coded weather observations, where '0' represents 'bad' weather and '1' represents 'good' weather
weather_coded = np.array([0, 1, 0, 1, 1, 0])

# Customer data: The number of customers per month during the same period
customers = np.array([2000, 2100, 1500, 8000, 7200, 2000])

# KEY: we combine the arrays into a pandas DataFrame -> the constructor takes a dictionary, where keys become column names (same dict syntax we saw before)
df = pd.DataFrame({'sales': sales,
                   'weather_coded': weather_coded,
                   'customers': customers})
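
# Quick illustration (not part of the original flow): the dict keys become the
# column names, so the same DataFrame could be built from plain Python lists.
df_check = pd.DataFrame({'sales': [30, 40, 35, 130, 120, 60],
                         'weather_coded': [0, 1, 0, 1, 1, 0],
                         'customers': [2000, 2100, 1500, 8000, 7200, 2000]})
print((df_check == df).all().all())  # True: element-wise identical to df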

"""
- A DataFrame is a 2D labeled data structure provided by pandas, similar to a
 table in a database or Excel spreadsheet.
- It consists of rows and columns, where each column can hold different 
types of data (e.g., numbers, strings, booleans).
- Each row and column has labels (the index for rows and column names for columns).
This makes it ideal for storing, manipulating, and analyzing structured data.
"""

# Print the initial DataFrame
print(f'{df}')
# Let's improve the DataFrame by adding a proper date index instead of the default numerical index (0, 1, 2, ...)
month_index = pd.date_range(start='04/2024', freq='M', periods=6)
# freq='M' means month-end frequency: we generate 6 dates, each the last day of the month, starting in April 2024
# We use date_range() because our data represents evenly spaced monthly observations over a 6-month period
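
# Illustrative check of the generated index (month-end dates, April-September 2024).
# Note: in recent pandas releases (>= 2.2) the preferred month-end alias is 'ME';
# 'M' still works but may emit a deprecation warning.
print(month_index)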

# Set the newly created dates as the index of the DataFrame
df.set_index(month_index, inplace=True)
# inplace=True means we are modifying the DataFrame directly 
df_new = df.set_index(month_index, inplace=False)
# with inplace=False we instead get a new DataFrame back, leaving df unchanged
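
# Sanity check (illustrative): both DataFrames now carry the same DatetimeIndex.
print(df.index.equals(df_new.index))  # True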

# print the DataFrame
print(f'{df}')

#%% Different ways of 'slicing' the data frame

# Accessing specific columns by column names
subset1 = df[['sales', 'customers']]  # Extract a sub-DataFrame with 2 columns
print(f'{subset1}')
# Note: Double brackets are needed to extract multiple columns: the inner brackets form a list of column names, the outer brackets index the DataFrame, so the result is again a DataFrame
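
# Illustration of the difference: double brackets return a DataFrame even for a
# single column, while single brackets return a Series (see below).
print(type(df[['sales']]))  # <class 'pandas.core.frame.DataFrame'>
print(type(df['sales']))    # <class 'pandas.core.series.Series'>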

# Access a single column as a Series
subset2 = df['sales']  # This returns a Series (single column)
print(f'{subset2}')

# Access the same column using dot notation
subset3 = df.sales  # This is an alternative syntax (same result)
print(f'{subset3}')
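
# Quick check: both forms return the same Series. Note that dot notation only
# works when the column name is a valid Python identifier and does not clash
# with an existing DataFrame attribute or method.
print(subset2.equals(subset3))  # True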

# Accessing certain rows by slicing (row numbers 1 to 3)
subset4 = df[1:4]  # Extract rows 1 through 3 (Python uses 0-based indexing)
print(f'{subset4}')

# Accessing rows using date ranges (index-based slicing)
subset5 = df['2024-05-31':'2024-07-31']  # Select rows between May 2024 and July 2024 (inclusive)
print(f'{subset5}')
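
# The same label-based selection can be written explicitly with .loc;
# unlike positional slices, label slices include both endpoints.
subset5_loc = df.loc['2024-05-31':'2024-07-31']
print(subset5_loc.equals(subset5))  # True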

# Using .iloc[] to access rows and columns by their integer positions
subset6 = df.iloc[1:4, 0:2]  # Select rows 1-3 and columns 0-1 (iloc is based on integer positions)
print(f'{subset6}')
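
# Illustrative counterpart with .loc: the same selection by labels
# (rows by date, columns by name) instead of integer positions.
subset6_loc = df.loc['2024-05-31':'2024-07-31', ['sales', 'weather_coded']]
print(subset6_loc.equals(subset6))  # True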

#%% Changing the data frame

# Print the full DataFrame again for reference
print(f'{df}')

# Create a shifted (lagged, t-2) version of the 'sales' column (shift data by 2 rows)
subset7 = df['sales'].shift(2)  # This shifts the sales data down by 2 rows; the first 2 entries become NaN
print(f'\n{subset7}\n')
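
# A typical use of shift() (illustrative sketch): month-over-month change in
# sales, i.e. current sales minus last month's sales (the first value becomes NaN).
mom_change = df['sales'] - df['sales'].shift(1)
print(f'{mom_change}\n')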

# Adding a new column to the DataFrame: lagged sales (2 months ago)
df['sales_lag2'] = df['sales'].shift(2)  # This is adding the sales_lag2 col to df
print(f'df: \n{df}\n')

# Use a pandas.Categorical object to attach labels to the weather data (0 = bad, 1 = good)
df['weather'] = pd.Categorical.from_codes(codes=df['weather_coded'], categories=['bad', 'good'])
# Here we convert the 'weather_coded' column from numeric codes to categorical labels
print(f'df: \n{df}\n')
# Not sure how this works? Try asking GPT: "Can you explain the syntax of this line?"
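
# Illustrative alternative: the same labels can be obtained with a plain
# mapping, but the result is an ordinary object column rather than a true
# Categorical with a fixed set of categories.
weather_mapped = df['weather_coded'].map({0: 'bad', 1: 'good'})
print(weather_mapped)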

#%% Other important pandas methods?

print(df.info())

print(df.describe())

print(df.isnull())  # Returns a DataFrame of the same shape with True where there are missing values
print(df.isnull().sum())  # Sums the number of missing values in each column
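
# Illustrative variant: which columns contain at least one missing value?
print(df.isnull().any())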

# NOTE: these methods return new objects and do not modify the original df unless you set inplace=True (where that parameter is available).
# Replace all NaN values with 99 using fillna().
# To do so, we first need to add 99 as a valid category in the 'weather' column,
# because a categorical column only accepts values from its category set
df['weather'] = df['weather'].cat.add_categories([99])
# (to understand why, try running the script with this line commented out)
df_filled = df.fillna(99)  
print(df_filled)
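
# A more targeted alternative (sketch): fill only the numeric lag column and
# leave the categorical 'weather' column untouched.
df_filled_col = df.copy()
df_filled_col['sales_lag2'] = df_filled_col['sales_lag2'].fillna(0)
print(df_filled_col)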

df_dropped = df.dropna()  # Drops rows with any NaN values
print(df_dropped)

df_sorted = df.sort_values(by='sales', ascending=False)  # Sort by sales in descending order
print(df_sorted)
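
# Related (illustrative): sort by the date index instead of by a column.
print(df.sort_index(ascending=False))  # most recent month first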

df_renamed = df.rename(columns={'sales': 'monthly_sales'})
print(df_renamed)

df_dropped = df.drop('sales_lag2', axis=1)  # Drops the column (axis=1) 'sales_lag2' (to drop a row: axis=0)
print(df_dropped)

df_dropped2 = df.drop(pd.Timestamp('2024-06-30'), axis=0)  # Drops the row labelled 2024-06-30 (passing an explicit Timestamp is the safest way to reference a label on a DatetimeIndex)
print(df_dropped2)

print(df['weather'].value_counts())  # Count occurrences of each weather category
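
# Illustrative follow-up: the categorical column also works as a grouping key,
# e.g. average sales under 'bad' vs 'good' weather (observed=True ignores the
# unused 99 category added above).
print(df.groupby('weather', observed=True)['sales'].mean())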
