{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "681681d7",
   "metadata": {},
   "source": [
    "### What this script does \n",
    "\n",
    "- Reads Cloudnet HATPRO daily NetCDF files → extracts IWV and LWP, concatenates them, and resamples to 10-minute means.\n",
    "\n",
    "- Reads Cabauw CESAR surface radiation NetCDF files → extracts SWD, SWU, LWD, LWU and combines.\n",
    "\n",
    "- Merges HATPRO (10-min) with Cabauw radiation on TIMESTAMP using an inner join, so only timestamps present in both datasets are kept.\n",
    "\n",
    "- Saves the merged dataset to Parquet.\n",
    "\n",
    "#### Edit before running\n",
    "1) Single-file inspection (optional): point to a specific NetCDF to introspect\n",
    "\n",
    "   file_path = r\"C:\\path\\to\\your\\Cabauw\\cloudnet-collection-...\\20240515_cabauw_hatpro_c28c803c.nc\"\n",
    "\n",
    "2) Folder of daily HATPRO files (IWV/LWP):\n",
    "   \n",
    "   folder_path = r\"C:\\path\\to\\your\\Cabauw\\cloudnet-collection-a38fe13808684f78\"\n",
    "\n",
    "3) Folder of CESAR radiation NetCDFs (SWD/SWU/LWD/LWU). Note that this reassigns the same `folder_path` variable used in step 2, in a later cell:\n",
    "\n",
    "   folder_path = r\"C:\\path\\to\\your\\Cabauw\"\n",
    "\n",
    "4) Output Parquet path for the merged dataset:\n",
    "   \n",
    "   output_path = r\"C:\\path\\to\\your\\Cabauw\\cabauw_merged_data_23-24.parquet\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "450f252c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import netCDF4 as nc\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import datetime as dt\n",
    "from datetime import datetime, timedelta\n",
    "from matplotlib.backends.backend_pdf import PdfPages\n",
    "\n",
    "import imageio\n",
    "from reportlab.lib.pagesizes import letter\n",
    "from reportlab.pdfgen import canvas\n",
    "from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle\n",
    "from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd99841c",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Edit before running!!\n",
    "# 1) Optional single-file inspection: set file_path to one NetCDF file to examine.\n",
    "# Example: file_path = r\"D:\\Thesis\\Cabauw\\cloudnet-collection-...\\20240515_cabauw_hatpro_*.nc\"\n",
    "file_path = r\"C:\\path\\to\\your\\Cabauw\\cloudnet-collection-...\\20240515_cabauw_hatpro_c28c803c.nc\"\n",
    "\n",
    "# Nothing to inspect when the path does not exist: report it and skip the dump.\n",
    "if not os.path.exists(file_path):\n",
    "    print(\" File not found:\", file_path)\n",
    "else:\n",
    "    try:\n",
    "        with nc.Dataset(file_path, 'r') as dataset:\n",
    "            print(\"\\n File opened successfully!\")\n",
    "\n",
    "            # Global (file-level) attributes, one per line\n",
    "            print(\"\\n File Global Attributes:\")\n",
    "            for attr_name in dataset.ncattrs():\n",
    "                print(f\"   {attr_name}: {getattr(dataset, attr_name)}\")\n",
    "\n",
    "            # Per-variable summary: dimensions, shape, units and a short preview\n",
    "            print(\"\\n Variables Overview:\\n\")\n",
    "            for var_name, var in dataset.variables.items():\n",
    "                print(f\" Variable: {var_name}\")\n",
    "                print(f\"   - Dimensions: {var.dimensions}\")\n",
    "                print(f\"   - Shape: {var.shape}\")\n",
    "\n",
    "                # \"N/A\" is shown when the variable carries no units attribute\n",
    "                print(f\"   - Units: {getattr(var, 'units', 'N/A')}\")\n",
    "\n",
    "                # Preview at most the first five entries along the leading axis;\n",
    "                # any failure while slicing is reported instead of aborting the dump.\n",
    "                try:\n",
    "                    if var.ndim == 1:\n",
    "                        preview = var[:5]\n",
    "                    else:\n",
    "                        preview = var[:5, ...]\n",
    "                    print(f\"   - Sample values: {preview}\\n\")\n",
    "                except Exception as e:\n",
    "                    print(f\"   - Could not preview values: {e}\\n\")\n",
    "\n",
    "    except Exception as e:\n",
    "        # Covers failures while opening or reading the file\n",
    "        print(f\" Error reading the NetCDF file: {e}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43bbf822",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Edit before running!!\n",
    "# 2) Folder of daily HATPRO files (IWV/LWP)\n",
    "# Must contain many \"*cabauw_hatpro*.nc\" files\n",
    "folder_path = r\"C:\\path\\to\\your\\Cabauw\\cloudnet-collection-a38fe13808684f78\"\n",
    "\n",
    "# One DataFrame per successfully read daily file\n",
    "df_list = []\n",
    "\n",
    "# Sorted so files are read (and later concatenated) in filename order\n",
    "for filename in sorted(os.listdir(folder_path)):\n",
    "    if filename.endswith(\".nc\") and \"cabauw_hatpro\" in filename:\n",
    "        file_path = os.path.join(folder_path, filename)\n",
    "\n",
    "        try:\n",
    "            with nc.Dataset(file_path, 'r') as ds:\n",
    "                print(f\"✅ Reading: {filename}\")\n",
    "\n",
    "                # Decode the numeric time axis using the variable's own units string\n",
    "                time_var = ds.variables['time']\n",
    "                time_dt = nc.num2date(time_var[:], units=time_var.units)\n",
    "\n",
    "                iwv_var = ds.variables['iwv']\n",
    "                lwp_var = ds.variables['lwp']\n",
    "\n",
    "                # Column labels embed each variable's units attribute (\"N/A\" if absent)\n",
    "                iwv_units = getattr(iwv_var, \"units\", \"N/A\")\n",
    "                lwp_units = getattr(lwp_var, \"units\", \"N/A\")\n",
    "\n",
    "                # TIMESTAMP still holds the objects returned by num2date here;\n",
    "                # conversion to pandas Timestamps happens in a later cell.\n",
    "                df_day = pd.DataFrame({\n",
    "                    'TIMESTAMP': time_dt,\n",
    "                    f'IWV ({iwv_units})': iwv_var[:],\n",
    "                    f'LWP ({lwp_units})': lwp_var[:]\n",
    "                })\n",
    "\n",
    "                df_list.append(df_day)\n",
    "\n",
    "        except Exception as e:\n",
    "            # Best effort: report the unreadable file and continue with the rest\n",
    "            print(f\"⚠️ Error reading {filename}: {e}\")\n",
    "\n",
    "# pd.concat raises an opaque \"No objects to concatenate\" on an empty list,\n",
    "# so fail early with a message that points at the folder setting instead.\n",
    "if not df_list:\n",
    "    raise FileNotFoundError(\n",
    "        f\"No readable '*cabauw_hatpro*.nc' files found in {folder_path!r}\"\n",
    "    )\n",
    "\n",
    "# Combine all days into one frame with a fresh 0..n-1 row index\n",
    "df_all_hatpro = pd.concat(df_list, ignore_index=True)\n",
    "\n",
    "# Show a preview\n",
    "print(\"\\n✅ Combined HATPRO Data:\")\n",
    "print(df_all_hatpro.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cac019d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the time objects to pandas Timestamps via their ISO-8601 string form.\n",
    "# A new frame is built instead of modifying df_all_hatpro in place, so this cell\n",
    "# can be re-run without a KeyError on an already-consumed TIMESTAMP column.\n",
    "hatpro_indexed = (\n",
    "    df_all_hatpro\n",
    "    .assign(TIMESTAMP=[pd.Timestamp(t.isoformat()) for t in df_all_hatpro['TIMESTAMP']])\n",
    "    .set_index('TIMESTAMP')\n",
    ")\n",
    "\n",
    "# Resample to 10-minute means, then turn TIMESTAMP back into a regular column.\n",
    "# '10min' is used because the equivalent 'T' alias is deprecated in pandas >= 2.2.\n",
    "df_10min = hatpro_indexed.resample('10min').mean().reset_index()\n",
    "\n",
    "# Preview\n",
    "print(df_10min)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb78a6cb",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Edit before running!!\n",
    "# 3) Folder of CESAR radiation NetCDFs (SWD/SWU/LWD/LWU)\n",
    "# Looks for files matching \"*cesar_surface_radiation*.nc\"\n",
    "folder_path = r\"C:\\path\\to\\your\\Cabauw\"\n",
    "\n",
    "# One DataFrame per radiation file\n",
    "all_radiation_dfs = []\n",
    "\n",
    "# Read order does not matter here: the combined frame is sorted by time below\n",
    "for filename in os.listdir(folder_path):\n",
    "    if filename.endswith(\".nc\") and \"cesar_surface_radiation\" in filename:\n",
    "        file_path = os.path.join(folder_path, filename)\n",
    "        with nc.Dataset(file_path, 'r') as ds:\n",
    "            # Decode the numeric time axis using the variable's own units string\n",
    "            time_var = ds.variables['time']\n",
    "            time_dt = nc.num2date(time_var[:], units=time_var.units)\n",
    "\n",
    "            swd = ds.variables['SWD'][:]\n",
    "            swu = ds.variables['SWU'][:]\n",
    "            lwd = ds.variables['LWD'][:]\n",
    "            lwu = ds.variables['LWU'][:]\n",
    "\n",
    "        # Convert to pandas Timestamps and round to the nearest second.\n",
    "        # DatetimeIndex.round is the datetime rounding API; Series.round expects a\n",
    "        # number of decimals, so it is not a reliable way to round timestamps.\n",
    "        time_index = pd.DatetimeIndex(\n",
    "            [pd.Timestamp(t.isoformat()) for t in time_dt]\n",
    "        ).round('s')\n",
    "\n",
    "        # Create DataFrame\n",
    "        df = pd.DataFrame({\n",
    "            'TIMESTAMP': time_index,\n",
    "            'SWD': swd,\n",
    "            'SWU': swu,\n",
    "            'LWD': lwd,\n",
    "            'LWU': lwu\n",
    "        })\n",
    "\n",
    "        all_radiation_dfs.append(df)\n",
    "\n",
    "# pd.concat raises an opaque \"No objects to concatenate\" on an empty list,\n",
    "# so fail early with a message that points at the folder setting instead.\n",
    "if not all_radiation_dfs:\n",
    "    raise FileNotFoundError(\n",
    "        f\"No '*cesar_surface_radiation*.nc' files found in {folder_path!r}\"\n",
    "    )\n",
    "\n",
    "# Combine all files, order chronologically and renumber the rows\n",
    "df_radiation = (\n",
    "    pd.concat(all_radiation_dfs, ignore_index=True)\n",
    "    .sort_values('TIMESTAMP')\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "# ✅ Preview the combined DataFrame\n",
    "print(df_radiation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7261922d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Inner join on TIMESTAMP: only timestamps present in both frames are kept\n",
    "df_merged = df_10min.merge(df_radiation, how='inner', on='TIMESTAMP')\n",
    "\n",
    "# Report the size of the result, then show its first rows\n",
    "print(f\"\\n✅ Merged DataFrame shape: {df_merged.shape}\")\n",
    "print(df_merged.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5cdb449b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the merged HATPRO + radiation frame for a visual check before saving\n",
    "print(df_merged)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e301d8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Edit before running!!\n",
    "# 4) Destination file for the merged dataset (Parquet format)\n",
    "output_path = r\"C:\\path\\to\\your\\Cabauw\\cabauw_merged_data_23-24.parquet\"\n",
    "\n",
    "# Write df_merged to disk; index=False leaves the row index out of the file\n",
    "df_merged.to_parquet(path=output_path, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
