{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "944c6bda",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
    "from datetime import datetime, timedelta\n",
    "from pathlib import Path\n",
    "from urllib.parse import urljoin\n",
    "\n",
    "import requests\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f66bc9ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shared HTTP settings read as globals by the download helpers in this notebook.\n",
    "REQUEST_TIMEOUT = 30  # seconds; handed to requests as the timeout of every request\n",
    "HEADERS = {}  # extra request headers; load any secrets from the environment, never hard-code them\n",
    "\n",
    "\n",
    "def download_file(url, file_path):\n",
    "    \"\"\"\n",
    "    Download a file from a URL and save it to a specified path.\n",
    "\n",
    "    The response body is streamed to ``<file_path>.part`` and renamed onto\n",
    "    ``file_path`` only after the whole transfer has been written, so a failed or\n",
    "    interrupted download never leaves a truncated file under the final name.\n",
    "    HTTP/network errors are reported with a printed message instead of being raised.\n",
    "\n",
    "    :param url: URL to download\n",
    "    :param file_path: Local path to save the downloaded file\n",
    "    \"\"\"\n",
    "    target_dir = os.path.dirname(file_path)\n",
    "    partial_path = f\"{file_path}.part\"\n",
    "    try:\n",
    "        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT, headers=HEADERS) as response:\n",
    "            response.raise_for_status()  # Raise an error for bad status\n",
    "            if target_dir:  # a bare filename has no directory part, and os.makedirs('') raises\n",
    "                os.makedirs(target_dir, exist_ok=True)\n",
    "            with open(partial_path, 'wb') as file:\n",
    "                for chunk in response.iter_content(chunk_size=8192):\n",
    "                    if chunk:\n",
    "                        file.write(chunk)\n",
    "        # Publish the file under its final name only once it is complete\n",
    "        os.replace(partial_path, file_path)\n",
    "        print(f\"Downloaded: {file_path}\")\n",
    "    except requests.exceptions.RequestException as e:\n",
    "        print(f\"Failed to download {url}: {e}\")\n",
    "    finally:\n",
    "        # A partial file that still exists here belongs to a transfer that did not finish\n",
    "        if os.path.exists(partial_path):\n",
    "            try:\n",
    "                os.remove(partial_path)\n",
    "            except OSError:\n",
    "                # Best-effort cleanup; do not mask the error that ended the download\n",
    "                pass\n",
    "\n",
    "def download_nc_files_from_directory(base_url, day_dir):\n",
    "    \"\"\"\n",
    "    Download every 'LV1.nc' file linked from an HTML directory listing, in parallel.\n",
    "\n",
    "    Each link is resolved against the listing URL with ``urljoin``, so relative,\n",
    "    root-relative and fully qualified hrefs all give a usable download URL.\n",
    "    A file that is linked more than once in the listing is fetched a single time.\n",
    "    Failure to fetch the listing itself is reported with a printed message.\n",
    "\n",
    "    :param base_url: Base URL for the directory containing .nc files (an HTML listing)\n",
    "    :param day_dir: Local directory to save the downloaded files\n",
    "\n",
    "    NOTE:\n",
    "    - By default this filters for files that end with 'LV1.nc'. If your server uses a\n",
    "      different pattern, change the check below (e.g. to '.nc').\n",
    "    \"\"\"\n",
    "    try:\n",
    "        response = requests.get(base_url, timeout=REQUEST_TIMEOUT, headers=HEADERS)\n",
    "        response.raise_for_status()\n",
    "        soup = BeautifulSoup(response.text, 'html.parser')\n",
    "\n",
    "        # Exactly one trailing slash makes urljoin treat the listing as a directory\n",
    "        listing_url = base_url.rstrip('/') + '/'\n",
    "\n",
    "        # Local path -> remote URL. Keying on the path drops repeated links, so two\n",
    "        # workers never write the same file at the same time.\n",
    "        targets = {}\n",
    "        for link in soup.find_all('a'):\n",
    "            href = link.get('href')\n",
    "            if not href:\n",
    "                continue\n",
    "            # Adjust this line if you want all .nc files:\n",
    "            #   if href.endswith('.nc'):\n",
    "            # For Cloud Radar \"LV1.nc\" only:\n",
    "            if href.endswith('LV1.nc'):\n",
    "                # Prevent path traversal; only use the basename for local writes\n",
    "                name = os.path.basename(href)\n",
    "                file_path = os.path.join(day_dir, name)\n",
    "                targets.setdefault(file_path, urljoin(listing_url, href))\n",
    "\n",
    "        if not targets:\n",
    "            print(f\"No 'LV1.nc' files found at {base_url}\")\n",
    "            return\n",
    "\n",
    "        # Download files in parallel\n",
    "        with ThreadPoolExecutor(max_workers=10) as executor:\n",
    "            future_to_url = {\n",
    "                executor.submit(download_file, file_url, file_path): file_url\n",
    "                for file_path, file_url in targets.items()\n",
    "            }\n",
    "            for future in as_completed(future_to_url):\n",
    "                url = future_to_url[future]\n",
    "                try:\n",
    "                    future.result()\n",
    "                except Exception as e:\n",
    "                    print(f\"Error downloading {url}: {e}\")\n",
    "\n",
    "    except requests.exceptions.RequestException as e:\n",
    "        print(f\"Failed to access {base_url}: {e}\")\n",
    "\n",
    "def download_files_for_month(base_url_template, start_date, end_date, base_dir):\n",
    "    \"\"\"\n",
    "    Walk every calendar day from start_date to end_date (both included) and\n",
    "    download that day's files via download_nc_files_from_directory.\n",
    "\n",
    "    Files for a day are stored under ``<base_dir>/<YYYY-MM>/<YYYY-MM-DD>``.\n",
    "\n",
    "    :param base_url_template: URL template with placeholders {year}, {month}, {day}\n",
    "                              Example: \"https://YOUR_PRIVATE_HOST/path/Y{year}/M{month}/D{day}\"\n",
    "    :param start_date: First day to fetch (format: 'YYYY-MM-DD')\n",
    "    :param end_date: Last day to fetch (format: 'YYYY-MM-DD')\n",
    "    :param base_dir: Root of the local directory tree that receives the files\n",
    "\n",
    "    HOW TO USE base_url_template:\n",
    "    - Swap YOUR_PRIVATE_HOST and path for your own server layout.\n",
    "    - Leave the {year}/{month}/{day} placeholders exactly as written.\n",
    "    - A placeholder that is safe to publish:\n",
    "      \"https://example.org/dataset/Y{year}/M{month}/D{day}\"\n",
    "    \"\"\"\n",
    "    first_day = datetime.strptime(start_date, '%Y-%m-%d')\n",
    "    last_day = datetime.strptime(end_date, '%Y-%m-%d')\n",
    "\n",
    "    # Both bounds are parsed as midnight, so the gap is a whole number of days;\n",
    "    # an end date before the start date gives an empty range and no downloads.\n",
    "    day_count = (last_day - first_day).days + 1\n",
    "\n",
    "    for offset in range(day_count):\n",
    "        day = first_day + timedelta(days=offset)\n",
    "        year, month, day_of_month = day.strftime('%Y %m %d').split()\n",
    "        date_str = f\"{year}-{month}-{day_of_month}\"\n",
    "\n",
    "        # Fill the caller's template rather than relying on a hard-coded URL\n",
    "        base_url = base_url_template.format(year=year, month=month, day=day_of_month)\n",
    "        day_dir = os.path.join(base_dir, f\"{year}-{month}\", date_str)\n",
    "\n",
    "        print(f\"Downloading files for {date_str} from {base_url} ...\")\n",
    "        download_nc_files_from_directory(base_url, day_dir)\n",
    "\n",
    "# -------------------------\n",
    "# EXAMPLES\n",
    "# -------------------------\n",
    "# 1) Public-safe placeholder (edit to your server before running):\n",
    "# base_url_template = \"https://example.org/dataset/Y{year}/M{month}/D{day}\"\n",
    "#\n",
    "# 2) If you need Cloud Radar with 'LV1.nc' suffix, keep the default filter above.\n",
    "#    To download ALL .nc files instead, change `href.endswith('LV1.nc')` to `href.endswith('.nc')`.\n",
    "#\n",
    "# 3) Choose a neutral, repo-local output directory:\n",
    "# base_dir = \"./data/cloud_radar\"\n",
    "#\n",
    "# 4) Pick dates:\n",
    "# start_date = \"2024-03-21\"\n",
    "# end_date   = \"2024-03-31\"\n",
    "#\n",
    "# 5) Run:\n",
    "# download_files_for_month(base_url_template, start_date, end_date, base_dir)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17cd80c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_files_from_directory(base_url, day_dir, file_extension):\n",
    "    \"\"\"\n",
    "    Download all files with a given extension from a directory using parallel downloads.\n",
    "\n",
    "    Each link is resolved against the listing URL with ``urljoin``, so relative,\n",
    "    root-relative and fully qualified hrefs all give a usable download URL.\n",
    "    A file that is linked more than once in the listing is fetched a single time,\n",
    "    and links without a file name (for example ones ending in '/') are skipped.\n",
    "    Failure to fetch the listing itself is reported with a printed message.\n",
    "\n",
    "    :param base_url: Base URL for the directory containing the files (HTML listing page)\n",
    "    :param day_dir: Local directory to save the downloaded files\n",
    "    :param file_extension: File extension to filter (e.g., '.nc', '.dat')\n",
    "\n",
    "    NOTE: Keep your real server URL private. In public code, pass it at runtime or\n",
    "    store it in an environment variable. See usage examples at the bottom.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        response = requests.get(base_url, timeout=REQUEST_TIMEOUT, headers=HEADERS)\n",
    "        response.raise_for_status()\n",
    "        soup = BeautifulSoup(response.text, 'html.parser')\n",
    "\n",
    "        # Exactly one trailing slash makes urljoin treat the listing as a directory\n",
    "        listing_url = base_url.rstrip('/') + '/'\n",
    "\n",
    "        # Local path -> remote URL. Keying on the path drops repeated links, so two\n",
    "        # workers never write the same file at the same time.\n",
    "        targets = {}\n",
    "        for link in soup.find_all('a'):\n",
    "            href = link.get('href')\n",
    "            if not href:\n",
    "                continue\n",
    "            if href.endswith(file_extension):\n",
    "                # Only the basename is used locally, which prevents path traversal\n",
    "                name = os.path.basename(href)\n",
    "                if not name:\n",
    "                    # No file name to write to; the target would be day_dir itself\n",
    "                    continue\n",
    "                file_path = os.path.join(day_dir, name)\n",
    "                targets.setdefault(file_path, urljoin(listing_url, href))\n",
    "\n",
    "        if not targets:\n",
    "            print(f\"No '{file_extension}' files found at {base_url}\")\n",
    "            return\n",
    "\n",
    "        # Download files in parallel\n",
    "        with ThreadPoolExecutor(max_workers=10) as executor:\n",
    "            future_to_url = {\n",
    "                executor.submit(download_file, file_url, file_path): file_url\n",
    "                for file_path, file_url in targets.items()\n",
    "            }\n",
    "            for future in as_completed(future_to_url):\n",
    "                url = future_to_url[future]\n",
    "                try:\n",
    "                    future.result()\n",
    "                except Exception as e:\n",
    "                    print(f\"Error downloading {url}: {e}\")\n",
    "\n",
    "    except requests.exceptions.RequestException as e:\n",
    "        print(f\"Failed to access {base_url}: {e}\")\n",
    "\n",
    "\n",
    "def download_sonic_anemometer_files_for_month(base_url_template, start_date, end_date, base_dir):\n",
    "    \"\"\"\n",
    "    Download all Sonic Anemometer files with '.dat' extension for each day in [start_date, end_date].\n",
    "\n",
    "    For every day the URL template is filled in, the local folder\n",
    "    ``<base_dir>/<YYYY-MM>/<YYYY-MM-DD>`` is created if it does not exist yet, and\n",
    "    both are handed to download_files_from_directory with the '.dat' filter.\n",
    "\n",
    "    :param base_url_template: URL template with placeholders {year}, {month}, {day}\n",
    "                              Example (keep private; don't commit your real host):\n",
    "                              \"https://YOUR_PRIVATE_HOST/path/Y{year}/M{month}/D{day}\"\n",
    "    :param start_date: Start date (format: 'YYYY-MM-DD')\n",
    "    :param end_date: End date (format: 'YYYY-MM-DD')\n",
    "    :param base_dir: Base directory where files should be stored locally\n",
    "\n",
    "    HOW TO USE base_url_template (without exposing private URLs in the repo):\n",
    "    - In code (private script) or via an environment variable at runtime.\n",
    "    - Keep the {year}/{month}/{day} placeholders unchanged.\n",
    "    \"\"\"\n",
    "    current_date = datetime.strptime(start_date, '%Y-%m-%d')\n",
    "    end_date_obj = datetime.strptime(end_date, '%Y-%m-%d')\n",
    "\n",
    "    while current_date <= end_date_obj:\n",
    "        date_str = current_date.strftime('%Y-%m-%d')\n",
    "        year_month = current_date.strftime('%Y-%m')\n",
    "        day_str = current_date.strftime('%d')\n",
    "        month_str = current_date.strftime('%m')\n",
    "\n",
    "        base_url = base_url_template.format(\n",
    "            year=current_date.strftime('%Y'),\n",
    "            month=month_str,\n",
    "            day=day_str\n",
    "        )\n",
    "\n",
    "        day_dir = os.path.join(base_dir, year_month, date_str)\n",
    "        # exist_ok avoids the check-then-create race of a separate os.path.exists test\n",
    "        os.makedirs(day_dir, exist_ok=True)\n",
    "\n",
    "        print(f\"Downloading Sonic Anemometer files for {date_str}...\")\n",
    "        download_files_from_directory(base_url, day_dir, '.dat')\n",
    "\n",
    "        current_date += timedelta(days=1)\n",
    "\n",
    "\n",
    "# -------------------------\n",
    "# EXAMPLES \n",
    "# -------------------------\n",
    "# Do NOT hard-code private hosts or personal paths in your public repo.\n",
    "# Instead, set them at runtime or via environment variables.\n",
    "\n",
    "# Example (placeholder) — replace at runtime, not in the repo:\n",
    "# base_url_template_sonic = \"https://example.org/Sonic_Anemometer/Y{year}/M{month}/D{day}\"\n",
    "#\n",
    "# Suggest a neutral local directory inside the repo (ok to commit empty, but\n",
    "# add 'data/' to .gitignore so downloads are not committed):\n",
    "# base_dir_sonic = \"./data/sonic\"\n",
    "#\n",
    "# start_date_sonic = \"2024-07-01\"\n",
    "# end_date_sonic   = \"2024-07-10\"\n",
    "#\n",
    "# To run privately (e.g., setting the real host via env var):\n",
    "#   import os\n",
    "#   base_url_template_sonic = os.getenv(\"SONIC_URL_TEMPLATE\", \"https://example.org/Sonic_Anemometer/Y{year}/M{month}/D{day}\")\n",
    "#   download_sonic_anemometer_files_for_month(\n",
    "#       base_url_template_sonic, start_date_sonic, end_date_sonic, base_dir_sonic\n",
    "#   )\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}