{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "40ceb440",
   "metadata": {},
   "source": [
    "### What the script does\n",
    "\n",
    "- Opens all .NC files for a given day (MET, TPC, HPC, BLH).\n",
    "\n",
    "- Co- nverts sensor time to datetimes (epoch 2001-01-01).\n",
    "\n",
    "- Builds tidy tables:\n",
    "\n",
    "    - MET: surface pressure/temperature vs time.\n",
    "\n",
    "    - TPC: temperature profiles (T), computes saturation vapor pressure es(T).\n",
    "\n",
    "    - PC: relative humidity profiles (RH).\n",
    "\n",
    "    - Merges MET+TPC+HPC by (Time, Altitude).\n",
    "\n",
    "- Computes:\n",
    "\n",
    "    - Hydrostatic pressure profile with a trapezoidal integration from surface pressure.\n",
    "\n",
    "    - Vapor pressure ev = RH * es / 100.\n",
    "\n",
    "    - Specific humidity qv from ev and pressure.\n",
    "\n",
    "    - Virtual potential temperature θv (two ways: direct + alternative using θ).\n",
    "\n",
    "- Resamples profiles to 10-minute means and saves a Parquet dataset.\n",
    "\n",
    "- Makes several quicklook plots (profiles at selected times, comparison of θv methods, etc.).\n",
    "\n",
    "- Loads BLH product and plots BLH time series.\n",
    "\n",
    "- Estimates CBL height z_i from θv with:\n",
    "\n",
    "- Gradient method (max dθv/dz, smoothed).\n",
    "\n",
    "- Parcel method (first z where θv(z) > θv_surface).\n",
    "\n",
    "- Saves z_i time series to CSV.\n",
    "\n",
    "#### Lines / places you should change\n",
    "\n",
    "- Input folder (where the day’s .NC files live): folder_path = r\"C:\\path\\to\\your\\Microwave_radiometer\\2024-05\\2024-05-13\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26228cf0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import netCDF4 as nc\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import datetime as dt\n",
    "from datetime import datetime, timedelta\n",
    "from scipy.ndimage import gaussian_filter1d\n",
    "\n",
    "from matplotlib.backends.backend_pdf import PdfPages\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59f32ba2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Constants\n",
    "C_p = 1005  # Specific heat capacity of dry air at constant pressure (J/kg/K)\n",
    "g = 9.81    # Gravitational acceleration (m/s^2)\n",
    "Ttrip = 273.16  # Triple point temperature in Kelvin\n",
    "Rd=287.04\n",
    "Rv=461.5\n",
    "epsilon=(Rv/Rd)-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc9642b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert time from seconds since 1.1.2001, 00:00:00 to datetime\n",
    "def convert_time(base_time, time_array):\n",
    "    return [base_time + timedelta(seconds=int(t)) for t in time_array]\n",
    "# Function to calculate saturation vapor pressure (es) from temperature (T)\n",
    "def calculate_saturation_vapor_pressure(T):\n",
    "    es = 610.78 * np.exp(17.2694 * (T - Ttrip) / (T - 35.86))\n",
    "    return es\n",
    "# Function to calculate specific humidity (qv) from vapor pressure (ev) and atmospheric pressure (p)\n",
    "def calculate_specific_humidity(ev, p):\n",
    "    return ev * 1000 / (p*100 + (((Rv / Rd) - 1) * (p*100 - ev)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35d684cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "# Function to calculate pressure at different heights using numerical integration\n",
    "def calculate_pressure_at_heights(df):\n",
    "    calculated_pressures = []  # To store pressure values for all times and altitudes\n",
    "    \n",
    "    # Group the dataframe by Time\n",
    "    for time, group in df.groupby('Time'):\n",
    "        group = group.sort_values('Altitude').reset_index(drop=True)  # Sort by altitude\n",
    "        \n",
    "        # Start with surface pressure at the first (lowest) altitude\n",
    "        P0 = group['Surface Pressure'].iloc[0] #* 100  # Convert hPa to Pa\n",
    "        pressures = [P0]\n",
    "        \n",
    "        # Perform numerical integration for each altitude step\n",
    "        for i in range(1, len(group)):\n",
    "            z1 = group['Altitude'].iloc[i-1]\n",
    "            z2 = group['Altitude'].iloc[i]\n",
    "            T1 = group['Temperature'].iloc[i-1]\n",
    "            \n",
    "            # Calculate pressure difference using trapezoidal rule\n",
    "            delta_z = z2 - z1\n",
    "            P_prev = pressures[-1]\n",
    "            delta_P = -g * P_prev * delta_z / (Rd * T1)\n",
    "            P_next = P_prev + delta_P\n",
    "            pressures.append(P_next)\n",
    "        \n",
    "        # Store the calculated pressures for this group\n",
    "        calculated_pressures.extend(pressures)\n",
    "    \n",
    "    return calculated_pressures\n",
    "'''\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ffd87ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_pressure_at_heights(df):\n",
    "    calculated_pressures = []  # To store pressure values for all times and altitudes\n",
    "    \n",
    "    # Group the dataframe by Time\n",
    "    for time, group in df.groupby('Time'):\n",
    "        group = group.sort_values('Altitude').reset_index(drop=True)  # Sort by altitude\n",
    "        \n",
    "        # Start with surface pressure at the first (lowest) altitude\n",
    "        P0 = group['Surface Pressure'].iloc[0]  # Initial surface pressure (hPa)\n",
    "        pressures = [P0]  # First pressure is the surface pressure\n",
    "\n",
    "        # Perform second-order (Trapezoidal Rule) integration for each altitude step\n",
    "        for i in range(1, len(group)):\n",
    "            z1 = group['Altitude'].iloc[i - 1]\n",
    "            z2 = group['Altitude'].iloc[i]\n",
    "            T1 = group['Temperature'].iloc[i - 1]\n",
    "            T2 = group['Temperature'].iloc[i]\n",
    "            P1 = pressures[-1]  # Previous pressure value\n",
    "            \n",
    "            delta_z = z2 - z1  # Altitude difference\n",
    "            \n",
    "            # First order estimate of P2 based on P1\n",
    "            P2_guess = P1 * np.exp(-g * delta_z / (Rd * T1))\n",
    "            \n",
    "            # Trapezoidal rule correction for P2\n",
    "            delta_P = 0.5 * (-(g * P1 / (Rd * T1)) + -(g * P2_guess / (Rd * T2))) * delta_z\n",
    "            P2 = P1 + delta_P\n",
    "            pressures.append(P2)\n",
    "        \n",
    "        # Store the calculated pressures for this group\n",
    "        calculated_pressures.extend(pressures)\n",
    "    \n",
    "    return calculated_pressures\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d52b161",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate theta_v using the provided formula\n",
    "def calculate_theta_v(row):\n",
    "    T = row['Temperature']  # Temperature at the current altitude\n",
    "    qv = row['Specific Humidity (qv)']  # Specific humidity\n",
    "    z=row['Altitude']\n",
    "    # Calculate theta_v\n",
    "    theta_v = (T+g*(z/C_p)) * (1 + (epsilon * qv/1000))\n",
    "    return theta_v\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5c63dd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "# Function to calculate potential temperature (theta) based on current pressure level and surface pressure\n",
    "def calculate_potential_temperature(T, P_i, P_0):\n",
    "    \"\"\"\n",
    "    Calculate potential temperature (theta) given temperature T (K) and pressures at level P_i (hPa) and surface pressure P_0 (hPa).\n",
    "    \"\"\"\n",
    "    return T * (P_0 / P_i) ** (Rd / C_p)\n",
    "\n",
    "# Alternative method to calculate theta_v using potential temperature across altitude steps\n",
    "def calculate_theta_v_alternative_by_altitude(df):\n",
    "    \"\"\"\n",
    "    Calculate virtual potential temperature (theta_v) for each time and altitude step.\n",
    "    This function iterates over each altitude for each time step and uses the surface pressure.\n",
    "    \"\"\"\n",
    "    df = df.sort_values('Altitude').reset_index(drop=True)  # Ensure data is sorted by altitude\n",
    "    \n",
    "    theta_v_alternative = []  # Store results\n",
    "    \n",
    "    theta_list   = []\n",
    "    # Loop over rows in the DataFrame (i.e., for each altitude level)\n",
    "    for i in range(len(df)):\n",
    "        T = df.loc[i, 'Temperature'] #+ 273.15  # Convert temperature from Celsius to Kelvin\n",
    "        P_i = df.loc[i, 'Calculated Pressure']# / 100  # Pressure at current altitude in hPa\n",
    "        qv = df.loc[i, 'Specific Humidity (qv)']  # Specific humidity (kg/kg)\n",
    "        \n",
    "        if i == 0:  # At the surface (lowest altitude)\n",
    "            P_0 = P_i  # Surface pressure at the first level\n",
    "            theta = T  # For the first altitude, theta is simply T\n",
    "        else:\n",
    "            P_0 = df.loc[0, 'Calculated Pressure']# / 100  # Surface pressure (remains constant)\n",
    "            # Calculate potential temperature using surface pressure\n",
    "            theta = calculate_potential_temperature(T, P_i, P_0)\n",
    "        \n",
    "        # Calculate theta_v for this level\n",
    "        theta_v_alt = theta * (1 + (epsilon * qv / 1000))\n",
    "        theta_list.append(theta)\n",
    "        # Append the calculated theta_v to the list\n",
    "        theta_v_alternative.append(theta_v_alt)\n",
    "    \n",
    "    return theta_list, theta_v_alternative\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da0d78ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Edit before running!!\n",
    "# Define the folder path\n",
    "folder_path = r\"C:\\path\\to\\your\\Microwave_radiometer\\2024-05\\2024-05-12\" #or other day\n",
    "\n",
    "# Get a list of all .NC files in the folder\n",
    "nc_files = [f for f in os.listdir(folder_path) if f.endswith('.NC')]\n",
    "\n",
    "# Dictionary to store the datasets\n",
    "datasets = {}\n",
    "\n",
    "# Loop through each file and open it\n",
    "for file_name in nc_files:\n",
    "    file_path = os.path.join(folder_path, file_name)\n",
    "    dataset = nc.Dataset(file_path, mode='r')\n",
    "    datasets[file_name] = dataset\n",
    "\n",
    "### MET FILE PROCESSING ###\n",
    "met_file = next((file_name for file_name in datasets.keys() if file_name.upper().endswith('.MET.NC')), None)\n",
    "surf_p_values = None\n",
    "surf_t_values = None\n",
    "time_met = None\n",
    "\n",
    "if met_file in datasets:\n",
    "    met_dataset = datasets[met_file]\n",
    "\n",
    "    try:\n",
    "        # Extract Surf_P, Surf_T, and time\n",
    "        if 'Surf_P' in met_dataset.variables:\n",
    "            surf_p_values = met_dataset.variables['Surf_P'][:]\n",
    "\n",
    "        if 'Surf_T' in met_dataset.variables:\n",
    "            surf_t_values = met_dataset.variables['Surf_T'][:]\n",
    "\n",
    "        if 'time' in met_dataset.variables:\n",
    "            times = met_dataset.variables['time'][:]\n",
    "            base_time = datetime(2001, 1, 1, 0, 0, 0)\n",
    "            time_met = convert_time(base_time, times)\n",
    "    \n",
    "    except Exception as e:\n",
    "        print(f\"Error accessing data in {met_file}: {e}\")\n",
    "\n",
    "# Create DataFrame for MET data\n",
    "df_met = pd.DataFrame({\n",
    "    'Time': time_met,\n",
    "    'Surface Temperature': surf_t_values,\n",
    "    'Surface Pressure': surf_p_values\n",
    "}) if time_met is not None else pd.DataFrame()\n",
    "\n",
    "### TPC FILE PROCESSING ###\n",
    "tpc_file = next((file_name for file_name in datasets.keys() if file_name.upper().endswith('.TPC.NC')), None)\n",
    "time_tpc = None\n",
    "altitude_tpc = None\n",
    "t_profs = None\n",
    "es_profiles = None\n",
    "\n",
    "if tpc_file:\n",
    "    tpc_dataset = datasets[tpc_file]\n",
    "\n",
    "    try:\n",
    "        # Extract time, altitude, and temperature profiles\n",
    "        time_tpc = tpc_dataset.variables['time'][:]\n",
    "        altitude_tpc = tpc_dataset.variables['altitude'][:]\n",
    "        t_profs = tpc_dataset.variables['T_prof'][:]\n",
    "\n",
    "        # Calculate saturation vapor pressure profiles\n",
    "        es_profiles = np.array([calculate_saturation_vapor_pressure(T) for T in t_profs])\n",
    "    \n",
    "    except Exception as e:\n",
    "        print(f\"Error accessing data in {tpc_file}: {e}\")\n",
    "\n",
    "# Create DataFrame for TPC data\n",
    "if time_tpc is not None:\n",
    "    times_tpc = convert_time(base_time, time_tpc)\n",
    "    \n",
    "    # Flattening the data for creating DataFrame\n",
    "    data = []\n",
    "    for i, t in enumerate(times_tpc):\n",
    "        for alt, temp, es in zip(altitude_tpc, t_profs[i, :], es_profiles[i, :]):\n",
    "            data.append([t, alt, temp, es])\n",
    "    \n",
    "    df_tpc = pd.DataFrame(data, columns=['Time', 'Altitude', 'Temperature', 'Saturation Vapor Pressure'])\n",
    "else:\n",
    "    df_tpc = pd.DataFrame()\n",
    "\n",
    "### HPC FILE PROCESSING ###\n",
    "hpc_file = next((file_name for file_name in datasets.keys() if file_name.upper().endswith('.HPC.NC')), None)\n",
    "time_hpc = None\n",
    "altitude_hpc = None\n",
    "rh_profiles = None\n",
    "\n",
    "if hpc_file:\n",
    "    hpc_dataset = datasets[hpc_file]\n",
    "\n",
    "    try:\n",
    "        # Extract time, altitude, and relative humidity profiles\n",
    "        time_hpc = hpc_dataset.variables['time'][:]\n",
    "        altitude_hpc = hpc_dataset.variables['altitude'][:]\n",
    "        rh_profiles = hpc_dataset.variables['RH_prof'][:]\n",
    "    \n",
    "    except Exception as e:\n",
    "        print(f\"Error accessing data in {hpc_file}: {e}\")\n",
    "\n",
    "# Create DataFrame for HPC data\n",
    "if time_hpc is not None:\n",
    "    times_hpc = convert_time(base_time, time_hpc)\n",
    "\n",
    "    # Flattening the data for creating DataFrame\n",
    "    data = []\n",
    "    for i, t in enumerate(times_hpc):\n",
    "        for alt, rh in zip(altitude_hpc, rh_profiles[i, :]):\n",
    "            data.append([t, alt, rh])\n",
    "    \n",
    "    df_hpc = pd.DataFrame(data, columns=['Time', 'Altitude', 'Relative Humidity'])\n",
    "else:\n",
    "    df_hpc = pd.DataFrame()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9997f77d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "### MERGE DATAFRAMES ###\n",
    "\n",
    "# Merge TPC and HPC data on 'Time' and 'Altitude'\n",
    "df_combined = pd.merge(df_hpc, df_tpc, on=['Time', 'Altitude'], how='inner')\n",
    "\n",
    "# Merge with MET data on 'Time'\n",
    "df_combined_pressure = pd.merge(df_met, df_combined, on='Time', how='inner')\n",
    "\n",
    "# Display the first few rows of the final combined DataFrame\n",
    "print(df_combined_pressure.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b10a2dac",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print(df_combined_pressure)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a7add40",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "\n",
    "# Apply the pressure calculation for each time and altitude\n",
    "df_combined_pressure['Calculated Pressure'] = calculate_pressure_at_heights(df_combined_pressure)\n",
    "\n",
    "# Display the updated DataFrame with calculated pressures\n",
    "print(df_combined_pressure)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c2f2e9c",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Calculate vapor pressure and add it as a new column\n",
    "df_combined_pressure['ev'] = df_combined_pressure['Relative Humidity'] * df_combined_pressure['Saturation Vapor Pressure'] / 100\n",
    "# Apply the specific humidity calculation\n",
    "df_combined_pressure['Specific Humidity (qv)'] = df_combined_pressure.apply(\n",
    "    lambda row: calculate_specific_humidity(row['ev'], row['Calculated Pressure']), axis=1\n",
    ")\n",
    "print(df_combined_pressure)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b65c68a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the calculation to the DataFrame\n",
    "df_combined_pressure['Theta_v'] = df_combined_pressure.apply(calculate_theta_v, axis=1)\n",
    "print(df_combined_pressure)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06cbe02a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the altitude-based theta_v calculation for each time step\n",
    "#df_combined_pressure['Theta_v_Alternative'] = df_combined_pressure.groupby('Time').apply(calculate_theta_v_alternative_by_altitude).explode().values\n",
    "# 1) Run groupby.apply once, producing a Series of (theta_list, theta_v_list) tuples\n",
    "grouped = df_combined_pressure.groupby(\"Time\") \\\n",
    "            .apply(calculate_theta_v_alternative_by_altitude)\n",
    "\n",
    "# 2) Extract the “theta_list” (first element of each tuple), explode, and align\n",
    "theta_series = (\n",
    "    grouped\n",
    "      .apply(lambda tup: tup[0])  # take the first element → [θ0, θ1, …]\n",
    "      .explode()                  # flatten into a single long Series\n",
    "      .reset_index(drop=True)     # drop the Time index so it lines up with df_combined_pressure\n",
    ")\n",
    "\n",
    "# 3) Extract the “theta_v_alternative” (second element), explode, and align\n",
    "theta_v_series = (\n",
    "    grouped\n",
    "      .apply(lambda tup: tup[1])  # take the second element → [θᵥ0, θᵥ1, …]\n",
    "      .explode()\n",
    "      .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "# 4) Assign these back as two new columns\n",
    "df_combined_pressure[\"Theta\"] = theta_series.values\n",
    "df_combined_pressure[\"Theta_v_Alternative\"]   = theta_v_series.values\n",
    "\n",
    "# Now each row in df_combined_pressure has both the θ and θᵥ that correspond\n",
    "# to its altitude (grouped by Time).\n",
    "\n",
    "# Display the updated DataFrame with the alternative theta_v column\n",
    "print(df_combined_pressure[['Time', 'Altitude', 'Theta','Theta_v', 'Theta_v_Alternative']].head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66bf7c9e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "'''\n",
    "# Select data for a single time step (e.g., the first unique time in the dataset)\n",
    "selected_time = df_combined_pressure['Time'].unique()[600]  # Get the first unique time\n",
    "\n",
    "# Filter the DataFrame for the selected time\n",
    "df_single_time = df_combined_pressure[df_combined_pressure['Time'] == selected_time]\n",
    "\n",
    "# Create a figure and axis\n",
    "fig, ax1 = plt.subplots(figsize=(10, 6))\n",
    "\n",
    "# Plot Specific Humidity\n",
    "ax1.set_xlabel('Specific Humidity (g/kg)', color='blue')\n",
    "ax1.set_ylabel('Altitude (m)', color='blue')\n",
    "ax1.plot(df_single_time['Specific Humidity (qv)'], df_single_time['Altitude'], label='Specific Humidity (qv)', color='blue')\n",
    "ax1.tick_params(axis='y', labelcolor='blue')\n",
    "\n",
    "# Create a second y-axis for Theta_v\n",
    "ax2 = ax1.twinx()\n",
    "ax2.set_ylabel('Theta_v (K)', color='red')\n",
    "ax2.plot(df_single_time['Theta_v'], df_single_time['Altitude'], label='Theta_v', color='red')\n",
    "ax2.tick_params(axis='y', labelcolor='red')\n",
    "\n",
    "# Set the title and grid\n",
    "plt.title(f'Vertical Profile of Specific Humidity and Theta_v\\nTime: {selected_time}')\n",
    "ax1.grid(True)\n",
    "\n",
    "plt.show()\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0bd75ec",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Select data for a single time step (e.g., the first unique time in the dataset)\n",
    "selected_time = df_combined_pressure['Time'].unique()[800]  # Get the first unique time\n",
    "\n",
    "# Filter the DataFrame for the selected time\n",
    "df_single_time = df_combined_pressure[df_combined_pressure['Time'] == selected_time]\n",
    "\n",
    "# Create a figure and axis\n",
    "fig, ax1 = plt.subplots(figsize=(10, 6))\n",
    "\n",
    "# Plot Specific Humidity\n",
    "ax1.set_xlabel('Specific Humidity (g/kg)', color='blue')\n",
    "ax1.set_ylabel('Altitude (m)', color='blue')\n",
    "ax1.plot(df_single_time['Specific Humidity (qv)'], df_single_time['Altitude'], label='Specific Humidity (qv)', color='blue')\n",
    "ax1.tick_params(axis='y', labelcolor='blue')\n",
    "\n",
    "# Create a second y-axis for Theta_v\n",
    "ax2 = ax1.twinx()\n",
    "ax2.set_ylabel('Theta_v (K)', color='red')\n",
    "\n",
    "# Plot Theta_v (original) in red\n",
    "ax2.plot(df_single_time['Theta_v'], df_single_time['Altitude'], label='Theta_v', color='red')\n",
    "\n",
    "# Plot Theta_v_Alternative (new method) in green\n",
    "ax2.plot(df_single_time['Theta_v_Alternative'], df_single_time['Altitude'], label='Theta_v_Alternative', color='green', linestyle='--')\n",
    "\n",
    "# Set y-axis ticks for Theta_v\n",
    "ax2.tick_params(axis='y', labelcolor='red')\n",
    "\n",
    "# Add legend for both Theta_v and Theta_v_Alternative\n",
    "ax2.legend(loc='upper left')\n",
    "\n",
    "# Set the title and grid\n",
    "plt.title(f'Vertical Profile of Specific Humidity and Theta_v\\nTime: {selected_time}')\n",
    "ax1.grid(True)\n",
    "\n",
    "# Show plot\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc341c22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select data for a single time step (e.g., the first unique time in the dataset)\n",
    "selected_time = df_combined_pressure['Time'].unique()[600]  # Get the selected time\n",
    "\n",
    "# Filter the DataFrame for the selected time\n",
    "df_single_time = df_combined_pressure[df_combined_pressure['Time'] == selected_time]\n",
    "\n",
    "# Create a figure and axis\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "\n",
    "# Plot Theta_v (original) in red\n",
    "ax.plot(df_single_time['Temperature'], df_single_time['Altitude'], label='Temperature (K)', color='blue')\n",
    "\n",
    "# Plot Theta_v (original) in red\n",
    "ax.plot(df_single_time['Theta_v'], df_single_time['Altitude'], label='Theta_v (Original)', color='red')\n",
    "\n",
    "# Plot Theta_v_Alternative (new method) in green\n",
    "ax.plot(df_single_time['Theta_v_Alternative'], df_single_time['Altitude'], label='Theta_v (Alternative)', color='green', linestyle='--')\n",
    "\n",
    "# Set axis labels\n",
    "ax.set_xlabel('Theta_v (K)', color='black')\n",
    "ax.set_ylabel('Altitude (m)', color='black')\n",
    "\n",
    "# Add a legend to distinguish the two curves\n",
    "ax.legend(loc='upper left')\n",
    "\n",
    "# Set the title and grid\n",
    "plt.title(f'Vertical Profile of Theta_v (Original and Alternative)\\nTime: {selected_time}')\n",
    "ax.grid(True)\n",
    "plt.ylim(0,450)\n",
    "# Show plot\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56987b4d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Ensure the 'Time' column is a datetime type\n",
    "df_combined_pressure['Time'] = pd.to_datetime(df_combined_pressure['Time'])\n",
    "\n",
    "# Group by 'Time' and aggregate all other columns into lists\n",
    "df_aggregated = df_combined_pressure.groupby('Time').agg(lambda x: list(x)).reset_index()\n",
    "\n",
    "# Display the first few rows of the aggregated DataFrame\n",
    "print(df_aggregated.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35786699",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(df_aggregated.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85660919",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "\n",
    "# Function to compute the mean profile for arrays\n",
    "def compute_mean_profile(profiles):\n",
    "    if len(profiles) == 0:\n",
    "        return []\n",
    "    profiles_array = np.array(profiles)  # Directly convert to a numpy array\n",
    "    return np.mean(profiles_array, axis=0)\n",
    "\n",
    "# Resample and average function\n",
    "def resample_and_average(df, interval='10T'):\n",
    "    resampled = df.resample(interval).agg({\n",
    "        'Altitude': 'first',  # Assuming altitude doesn't change frequently\n",
    "        'Surface Temperature': lambda x: compute_mean_profile(list(x)),\n",
    "        'Surface Pressure': lambda x: compute_mean_profile(list(x)),\n",
    "        'Relative Humidity': lambda x: compute_mean_profile(list(x)),\n",
    "        'Temperature': lambda x: compute_mean_profile(list(x)),\n",
    "        'Saturation Vapor Pressure': lambda x: compute_mean_profile(list(x)),\n",
    "        'Calculated Pressure': lambda x: compute_mean_profile(list(x)),\n",
    "        'ev': lambda x: compute_mean_profile(list(x)),\n",
    "        'Specific Humidity (qv)': lambda x: compute_mean_profile(list(x)),\n",
    "        'Theta': lambda x: compute_mean_profile(list(x)),\n",
    "        'Theta_v': lambda x: compute_mean_profile(list(x)),\n",
    "        'Theta_v_Alternative': lambda x: compute_mean_profile(list(x)),\n",
    "    })\n",
    "    return resampled\n",
    "\n",
    "# Make sure your DataFrame is indexed by 'Time'\n",
    "df_aggregated.set_index('Time', inplace=True)\n",
    "\n",
    "# Apply the resampling function\n",
    "df_10min_avg = resample_and_average(df_aggregated)\n",
    "\n",
    "# Reset index if you want 'Time' back as a column\n",
    "df_10min_avg.reset_index(inplace=True)\n",
    "\n",
    "# Display the first few rows of the averaged DataFrame\n",
    "print(df_10min_avg.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b782282",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# 1. Build the output‐path inside the same folder\n",
    "parquet_path = os.path.join(folder_path, \"MWR_vertical_dataset_10min.parquet\")\n",
    "\n",
    "# 2. Save `df` to Parquet\n",
    "df_10min_avg.to_parquet(parquet_path, engine=\"pyarrow\", index=False)\n",
    "\n",
    "print(f\"Saved DataFrame to: {parquet_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99f97267",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Select the 10th entry in the averaged DataFrame\n",
    "example_data = df_10min_avg.iloc[60]  # Indexing starts at 0, so 9 is the 10th entry\n",
    "\n",
    "# Create subplots in one row\n",
    "fig, axs = plt.subplots(1, 4, figsize=(20, 5), sharey=True)\n",
    "\n",
    "# Plot Temperature\n",
    "axs[0].plot(example_data['Temperature'], example_data['Altitude'], label='Temperature (K)', color='red')\n",
    "axs[0].plot(example_data['Theta'], example_data['Altitude'], label='Theta (K)', color='green')\n",
    "\n",
    "axs[0].set_title('Temperature (K)', fontsize=14)\n",
    "axs[0].legend()   # ← add legend here\n",
    "\n",
    "axs[0].grid()\n",
    "\n",
    "# Plot Theta_v\n",
    "axs[1].plot(example_data['Theta_v'], example_data['Altitude'], label='Theta_v (K)', color='orange')\n",
    "axs[1].set_title('Theta_v (K)', fontsize=14)\n",
    "axs[1].grid()\n",
    "\n",
    "# Plot Relative Humidity (RH)\n",
    "axs[2].plot(example_data['Relative Humidity'], example_data['Altitude'], label='Relative Humidity (%)', color='blue')\n",
    "axs[2].set_title('Relative Humidity (%)', fontsize=14)\n",
    "axs[2].grid()\n",
    "\n",
    "# Plot Specific Humidity (qv)\n",
    "axs[3].plot(example_data['Specific Humidity (qv)'], example_data['Altitude'], label='Specific Humidity (g/kg)', color='green')\n",
    "axs[3].set_title('Specific Humidity (g/kg)', fontsize=14)\n",
    "axs[3].set_xlabel('Value')\n",
    "axs[3].grid()\n",
    "\n",
    "# Set shared y-axis limits\n",
    "axs[0].set_ylim(0,2000)  # Reverse y-axis for altitude\n",
    "axs[0].set_ylabel('Altitude (m)')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.suptitle(f'Profile at Time: {example_data[\"Time\"]}', fontsize=16, y=1.02)\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c341dee",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Select the 10th entry in the averaged DataFrame\n",
    "example_data = df_10min_avg.iloc[99]  # Indexing starts at 0, so 9 is the 10th entry\n",
    "\n",
    "# Create a single plot for Specific Humidity\n",
    "fig, ax = plt.subplots(figsize=(8, 6))  # Larger figure for presentation\n",
    "\n",
    "# Plot Specific Humidity (qv)\n",
    "ax.plot(example_data['Specific Humidity (qv)'], example_data['Altitude'], label='Specific Humidity (g/kg)', color='green')\n",
    "\n",
    "# Set the title and labels with larger font sizes\n",
    "ax.set_title('Specific Humidity (g/kg)', fontsize=18)\n",
    "ax.set_xlabel('Specific Humidity (g/kg)', fontsize=16)\n",
    "ax.set_ylabel('Altitude (m)', fontsize=16)\n",
    "\n",
    "# Increase tick label font size\n",
    "ax.tick_params(axis='both', which='major', labelsize=14)\n",
    "\n",
    "# Add a grid for better visibility\n",
    "ax.grid()\n",
    "\n",
    "# Add some space between the plot and the title\n",
    "plt.tight_layout()\n",
    "\n",
    "# Display the plot\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70e4a142",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the 10th entry in the averaged DataFrame\n",
    "example_data = df_10min_avg.iloc[99]  # Indexing starts at 0, so 79 is the specific entry\n",
    "\n",
    "# Create a single plot for Theta_v and Theta_v_Alternative\n",
    "fig, ax = plt.subplots(figsize=(8, 6))  # Larger figure for presentation\n",
    "\n",
    "# Plot Theta_v\n",
    "ax.plot(example_data['Theta_v'], example_data['Altitude'], label='Theta_v (K)', color='orange')\n",
    "\n",
    "# Plot Theta_v_Alternative\n",
    "ax.plot(example_data['Theta_v_Alternative'], example_data['Altitude'], label='Theta_v_Alternative (K)', color='purple', linestyle='--')\n",
    "\n",
    "# Set the title and labels with larger font sizes\n",
    "ax.set_title('Vertical Profile of Potential Temperature', fontsize=18)\n",
    "ax.set_xlabel('Theta_v (K)', fontsize=16)\n",
    "ax.set_ylabel('Altitude (m)', fontsize=16)\n",
    "\n",
    "# Increase tick label font size\n",
    "ax.tick_params(axis='both', which='major', labelsize=14)\n",
    "\n",
    "# Add a grid for better visibility\n",
    "ax.grid()\n",
    "\n",
    "# Add legend with larger font size for better presentation visibility\n",
    "ax.legend(fontsize=14)\n",
    "\n",
    "# Add some space between the plot and the title\n",
    "plt.tight_layout()\n",
    "\n",
    "# Display the plot\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9663a6ce",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "\n",
    "# Select the 10th entry in the averaged DataFrame\n",
    "example_data = df_10min_avg.iloc[76]  # Indexing starts at 0, so 9 is the 10th entry\n",
    "\n",
    "# Create subplots in one row\n",
    "fig, axs = plt.subplots(1, 4, figsize=(20, 5), sharey=True)\n",
    "\n",
    "# Scatter plot for Temperature\n",
    "axs[0].scatter(example_data['Temperature'], example_data['Altitude'], color='red')\n",
    "axs[0].set_title('Temperature (K)', fontsize=14)\n",
    "axs[0].set_xlabel('Value')\n",
    "axs[0].grid()\n",
    "\n",
    "# Scatter plot for Theta_v\n",
    "axs[1].scatter(example_data['Theta_v'], example_data['Altitude'], color='orange')\n",
    "axs[1].set_title('Theta_v (K)', fontsize=14)\n",
    "axs[1].set_xlabel('Value')\n",
    "axs[1].grid()\n",
    "\n",
    "# Scatter plot for Relative Humidity (RH)\n",
    "axs[2].scatter(example_data['Relative Humidity'], example_data['Altitude'], color='blue')\n",
    "axs[2].set_title('Relative Humidity (%)', fontsize=14)\n",
    "axs[2].set_xlabel('Value')\n",
    "axs[2].grid()\n",
    "\n",
    "# Scatter plot for Specific Humidity (qv)\n",
    "axs[3].scatter(example_data['Specific Humidity (qv)'], example_data['Altitude'], color='green')\n",
    "axs[3].set_title('Specific Humidity (g/kg)', fontsize=14)\n",
    "axs[3].set_xlabel('Value')\n",
    "axs[3].grid()\n",
    "\n",
    "# Add a horizontal line at 2 km\n",
    "for ax in axs:\n",
    "    ax.axhline(y=2000, color='red', linestyle='--')\n",
    "    ax.set_ylim(0, 3000)  # Reverse y-axis for altitude\n",
    "    ax.set_ylabel('Altitude (m)')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.suptitle(f'Profile at Time: {example_data[\"Time\"]}', fontsize=16, y=1.02)\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c1e2d08",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the 10th entry in the averaged DataFrame\n",
    "example_data = df_10min_avg.iloc[36]  # Indexing starts at 0, so 38 is the selected entry\n",
    "\n",
    "# Create a figure for Temperature and Theta_v\n",
    "plt.figure(figsize=(10, 6))\n",
    "\n",
    "# Scatter plot for Temperature\n",
    "plt.scatter(example_data['Temperature'], example_data['Altitude'], color='red', label='Temperature (K)', alpha=0.7)\n",
    "\n",
    "# Scatter plot for Theta_v\n",
    "plt.scatter(example_data['Theta_v'], example_data['Altitude'], color='orange', label='Theta_v (K)', alpha=0.7)\n",
    "\n",
    "# Add a horizontal line at 2 km\n",
    "plt.axhline(y=2000, color='red', linestyle='--', label='2 km Threshold')\n",
    "\n",
    "# Set labels and title\n",
    "plt.title(f'Temperature and Theta_v Profile at Time: {example_data[\"Time\"]}', fontsize=16)\n",
    "plt.xlabel('Value (K)', fontsize=14)\n",
    "plt.ylabel('Altitude (m)', fontsize=14)\n",
    "\n",
    "# Add a legend\n",
    "plt.legend()\n",
    "#plt.ylim(0,1500)\n",
    "# Set grid\n",
    "plt.grid()\n",
    "\n",
    "# Show the plot\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "86bc6571",
   "metadata": {},
   "source": [
    "### BLH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d527a7e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Load the Boundary Layer Height (BLH) data\n",
    "blh_file = next((file_name for file_name in datasets.keys() if file_name.upper().endswith('.BLH.NC')), None)\n",
    "time_blh = None\n",
    "blh_values = None\n",
    "\n",
    "if blh_file in datasets:\n",
    "    blh_dataset = datasets[blh_file]\n",
    "\n",
    "    try:\n",
    "        # Extract time and BLH\n",
    "        if 'time' in blh_dataset.variables:\n",
    "            time_var = blh_dataset.variables['time'][:]\n",
    "            time_blh = [datetime(2001, 1, 1) + timedelta(seconds=int(t)) for t in time_var]\n",
    "\n",
    "        if 'BLH' in blh_dataset.variables:\n",
    "            blh_values = blh_dataset.variables['BLH'][:]\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"Error accessing data in {blh_file}: {e}\")\n",
    "\n",
    "# Create DataFrame for BLH data\n",
    "df_blh = pd.DataFrame({\n",
    "    'Time': time_blh,\n",
    "    'BLH': blh_values\n",
    "}) if time_blh is not None and blh_values is not None else pd.DataFrame()\n",
    "\n",
    "# Close the datasets\n",
    "for dataset in datasets.values():\n",
    "    dataset.close()\n",
    "    \n",
    "print(df_blh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef5e142f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "\n",
    "# Check if the DataFrame is not empty\n",
    "if not df_blh.empty:\n",
    "    plt.figure(figsize=(10, 6))\n",
    "    plt.plot(df_blh['Time'], df_blh['BLH'], marker='o', linestyle='-', color='b')\n",
    "    plt.title('Boundary Layer Height Over Time')\n",
    "    plt.xlabel('Time')\n",
    "    plt.ylabel('Boundary Layer Height (m)')\n",
    "    plt.xticks(rotation=45)\n",
    "    plt.grid()\n",
    "    plt.tight_layout()  # Adjust layout to prevent clipping of tick-labels\n",
    "    plt.show()\n",
    "else:\n",
    "    print(\"No data to plot.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87537086",
   "metadata": {},
   "outputs": [],
   "source": [
    "def estimate_zi_from_theta_v(theta_v, z):\n",
    "    \"\"\"Estimate CBL height from theta_v profile using gradient method.\"\"\"\n",
    "    dthetav_dz = np.gradient(theta_v, z)\n",
    "    dthetav_dz_smooth = gaussian_filter1d(dthetav_dz, sigma=1)  # Optional smoothing\n",
    "    zi_index = np.argmax(dthetav_dz_smooth)\n",
    "    return z[zi_index] if zi_index < len(z) else np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cafa7697",
   "metadata": {},
   "outputs": [],
   "source": [
    "zi_list = []\n",
    "zi_times = []\n",
    "\n",
    "for idx, row in df_10min_avg.iterrows():\n",
    "    z = row['Altitude']                # height array\n",
    "    theta_v = row['Theta_v_Alternative']    # already-computed profile\n",
    "\n",
    "    zi = estimate_zi_from_theta_v(theta_v, z)\n",
    "    zi_list.append(zi)\n",
    "    zi_times.append(row['Time'])\n",
    "\n",
    "zi_df = pd.DataFrame({'Time': zi_times, 'zi': zi_list})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d825eb77",
   "metadata": {},
   "outputs": [],
   "source": [
    "def estimate_zi_parcel_method(theta_v_profile, z_profile, theta_v_surface):\n",
    "    \"\"\"\n",
    "    Estimate boundary layer height (zi) using parcel method.\n",
    "    Finds first height where environmental theta_v > surface theta_v.\n",
    "    \n",
    "    Parameters:\n",
    "        theta_v_profile: array-like of virtual potential temperature with height\n",
    "        z_profile: array-like of corresponding height levels (same length)\n",
    "        theta_v_surface: surface virtual potential temperature (scalar)\n",
    "        \n",
    "    Returns:\n",
    "        Estimated boundary layer height (zi)\n",
    "    \"\"\"\n",
    "    for i, theta_env in enumerate(theta_v_profile):\n",
    "        if theta_env > theta_v_surface:\n",
    "            return z_profile[i]\n",
    "    return np.nan  # If no crossing is found"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9744fb4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "zi_parcel_list = []\n",
    "zi_parcel_times = []\n",
    "\n",
    "for idx, row in df_10min_avg.iterrows():\n",
    "    z = row['Altitude']                # height levels\n",
    "    theta_v = row['Theta_v_Alternative']  # profile (list or array)\n",
    "    \n",
    "    theta_v_surface = theta_v[0]  # assume surface is first level\n",
    "    zi = estimate_zi_parcel_method(theta_v, z, theta_v_surface)\n",
    "    \n",
    "    zi_parcel_list.append(zi)\n",
    "    zi_parcel_times.append(row['Time'])\n",
    "\n",
    "zi_parcel_df = pd.DataFrame({'Time': zi_parcel_times, 'zi_parcel': zi_parcel_list})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97a36ec4",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "\n",
    "plt.figure(figsize=(10, 5))\n",
    "plt.plot(zi_df['Time'], zi_df['zi'], label='CBL Height $z_i$', color='blue')\n",
    "plt.xlabel('Time (UTC)')\n",
    "plt.ylabel('Height (m)')\n",
    "plt.title('Estimated CBL Height on April 20, 2024')\n",
    "plt.grid(True)\n",
    "plt.legend()\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7fca03f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10, 5))\n",
    "\n",
    "# Plot gradient method\n",
    "plt.plot(zi_df['Time'], zi_df['zi'], label='CBL Height $z_i$ (Gradient Method)', color='blue')\n",
    "\n",
    "# Plot parcel method (make sure zi_parcel_df is created as shown earlier)\n",
    "plt.plot(zi_parcel_df['Time'], zi_parcel_df['zi_parcel'], \n",
    "         label='CBL Height $z_i$ (Parcel Method)', color='orange', linestyle='--')\n",
    "\n",
    "plt.xlabel('Time (UTC)')\n",
    "plt.ylabel('Height (m)')\n",
    "plt.title('Estimated CBL Height on May 23, 2024')\n",
    "plt.grid(True, linestyle='--', alpha=0.7)\n",
    "plt.legend()\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7553fab7",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(zi_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c479720",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add parcel method CBL height to the DataFrame\n",
    "zi_df['zi_parcel'] = zi_parcel_df['zi_parcel']\n",
    "\n",
    "\n",
    "output_file = os.path.join(folder_path, 'cbl_height_2024-05-12.csv')\n",
    "\n",
    "# Save the DataFrame\n",
    "zi_df.to_csv(output_file, index=False)\n",
    "print(f\"CBL height saved to: {output_file}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
