{ "cells": [ { "cell_type": "markdown", "id": "3528957d", "metadata": {}, "source": [ "# Data Extraction" ] }, { "cell_type": "markdown", "id": "2fe7c5ae-f4f2-4b15-8c53-6c2b38e9e090", "metadata": {}, "source": [ "The following notebook retrieves AIS data from the [UN Global Platform](https://unstats.un.org/wiki/display/AIS/AIS+Handbook+Outline)." ] }, { "cell_type": "markdown", "id": "0d2116c5-6dd4-468a-a0d9-883a8b5df980", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "code", "execution_count": 1, "id": "c33c3961", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import geopandas as gpd\n", "from shapely.geometry import mapping\n", "\n", "import json\n", "\n", "import pyspark.sql.functions as F\n", "from ais import functions as af\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 2, "id": "99cebdb6", "metadata": {}, "outputs": [], "source": [ "from IPython.core.interactiveshell import (\n", " InteractiveShell,\n", ") # allow multiple outputs in one jupyter cell\n", "\n", "InteractiveShell.ast_node_interactivity = \"all\"" ] }, { "cell_type": "code", "execution_count": 3, "id": "8477a219", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "

SparkSession - in-memory

\n", " \n", "
\n", "

SparkContext

\n", "\n", "

Spark UI

\n", "\n", "
\n", "
Version
\n", "
v3.5.0
\n", "
Master
\n", "
k8s://https://10.100.0.1:443
\n", "
AppName
\n", "
nb-d4ee6998-3c82-444f-931c-18193e6eb5b4-0f658
\n", "
\n", "
\n", " \n", "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spark" ] }, { "cell_type": "markdown", "id": "699e18ad", "metadata": {}, "source": [ "## Port Boundaries\n", "Get port boundaries from IMF. " ] }, { "cell_type": "code", "execution_count": 4, "id": "28ff33f0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 1595 entries, 0 to 1594\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 port_boundary 1595 non-null geometry\n", " 1 Port_name 1595 non-null object \n", " 2 Country 1595 non-null object \n", " 3 Continent 1595 non-null object \n", "dtypes: geometry(1), object(3)\n", "memory usage: 62.3+ KB\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
port_boundaryPort_nameCountryContinent
0POLYGON ((-149.93214 61.25829, -149.83906 61.2...AnchorageU.S.A.North-America
1POLYGON ((-148.68875 60.78098, -148.65289 60.7...WhittierU.S.A.North-America
2POLYGON ((-146.42760 61.11690, -146.38752 61.1...SwanportU.S.A.North-America
3POLYGON ((-130.38185 54.34481, -130.24392 54.3...Prince RupertU.S.A.North-America
4POLYGON ((-123.13286 49.26700, -123.13218 49.3...VancouverCanadaNorth-America
\n", "
" ], "text/plain": [ " port_boundary Port_name Country \\\n", "0 POLYGON ((-149.93214 61.25829, -149.83906 61.2... Anchorage U.S.A. \n", "1 POLYGON ((-148.68875 60.78098, -148.65289 60.7... Whittier U.S.A. \n", "2 POLYGON ((-146.42760 61.11690, -146.38752 61.1... Swanport U.S.A. \n", "3 POLYGON ((-130.38185 54.34481, -130.24392 54.3... Prince Rupert U.S.A. \n", "4 POLYGON ((-123.13286 49.26700, -123.13218 49.3... Vancouver Canada \n", "\n", " Continent \n", "0 North-America \n", "1 North-America \n", "2 North-America \n", "3 North-America \n", "4 North-America " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "token = \"\"\n", "project_id = 358\n", "file_path = \"data/AIS_Jasper/port_boundaries.geojson\"\n", "\n", "string = af.get_file_gitlab(\n", " token,\n", " project_id,\n", " file_path,\n", " org_path=\"https://code.officialstatistics.org\",\n", " branch=\"main\",\n", " csv_df=False,\n", ")\n", "\n", "imf = (\n", " gpd.GeoDataFrame.from_features(json.loads(string))\n", " .drop_duplicates()\n", " .rename(columns={\"geometry\": \"port_boundary\"})\n", " .set_geometry(\"port_boundary\")\n", ")\n", "imf.info()\n", "imf.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "b825df69", "metadata": {}, "outputs": [], "source": [ "imf = imf.drop_duplicates(subset=[\"Country\", \"Port_name\"], keep=\"first\")" ] }, { "cell_type": "markdown", "id": "b42af715", "metadata": {}, "source": [ "## Add Chokepoints" ] }, { "cell_type": "code", "execution_count": 7, "id": "aea503b8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1593" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(imf)" ] }, { "cell_type": "code", "execution_count": 8, "id": "06150e32", "metadata": {}, "outputs": [], "source": [ "chokepoints_str = '{\"type\":\"FeatureCollection\",\"features\":[{\"type\":\"Feature\",\"properties\":{\"Port_name\":\"Bab el-Mandeb Strait\",\"Country\":\"chokepoint\"},\"geometry\":{\"type\":\"Polygon\",\"coordinates\":[[[43.309777,12.493873],[43.331355,12.474386],[43.514229,12.664987],[43.479704,12.669198],[43.479704,12.669198],[43.309777,12.493873]]]}},{\"type\":\"Feature\",\"properties\":{\"Port_name\":\"Suez Canal\",\"Country\":\"chokepoint\"},\"geometry\":{\"type\":\"Polygon\",\"coordinates\":[[[32.553653,29.932182],[32.560088,29.926227],[32.583646,29.944185],[32.585828,29.955619],[32.580702,29.956848],[32.570668,29.942484],[32.570668,29.942484],[32.553653,29.932182]]]}},{\"type\":\"Feature\",\"properties\":{\"Port_name\":\"Cape of Good Hope\",\"Country\":\"chokepoint\"},\"geometry\":{\"type\":\"Polygon\",\"coordinates\":[[[19.877499,-34.845363],[19.910717,-37.614427],[19.9628,-37.616709],[19.944734,-36.60437],[19.926568,-34.850397],[19.926568,-34.850397],[19.877499,-34.845363]]]}}]}'\n", "chokepoints_gdf = (\n", " gpd.GeoDataFrame.from_features(json.loads(chokepoints_str))\n", " .rename(columns={\"geometry\": \"port_boundary\"})\n", " .set_geometry(\"port_boundary\")\n", ")" ] }, { "cell_type": "code", "execution_count": 9, "id": "b9f6f0d2", "metadata": {}, "outputs": [], "source": [ "imf = pd.concat([imf, chokepoints_gdf])" ] }, { "cell_type": "code", "execution_count": 10, "id": "e4ff3829", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1596" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "('U.S.A.:Anchorage',\n", " {'type': 'Polygon',\n", " 'coordinates': (((-149.9321447362291, 61.258287735699795),\n", " (-149.8390591326616, 61.2586723869542),\n", " (-149.84713680900424, 61.20020539628371),\n", " (-149.93829915629968, 61.19828214001165),\n", " (-149.9321447362291, 61.258287735699795)),)})" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query_polys = [\n", " (\n", " f\"{imf.Country.iloc[i]}:{imf.Port_name.iloc[i]}\",\n", " mapping(imf.port_boundary.iloc[i]),\n", " )\n", " for i in range(imf.shape[0])\n", "]\n", "len(query_polys)\n", "\n", "query_polys[0]" ] }, { "cell_type": "markdown", "id": "77eb3fb7", "metadata": {}, "source": [ "Use port boundaries to identify hexabins to query in the AIS data. We choose resolution 10 because some polygons are too small to be fitted a lower resolution." ] }, { "cell_type": "code", "execution_count": 11, "id": "8d448959", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 6415967 entries, 0 to 6415966\n", "Data columns (total 3 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 hex_id int64 \n", " 1 polygon_name object\n", " 2 hex_resolution int64 \n", "dtypes: int64(2), object(1)\n", "memory usage: 146.8+ MB\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING: there are hex ids contained in more than 1 polygon\n" ] } ], "source": [ "imf_df_hex = af.polygon_to_hex_df(query_polys, hex_resolution=10, overfill=False)\n", "imf_df_hex.info()" ] }, { "cell_type": "code", "execution_count": 19, "id": "6a228718", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hex_idpolygon_namehex_resolution
6415962624811593067102207chokepoint:Cape of Good Hope10
6415963625181089372012543chokepoint:Cape of Good Hope10
6415964625181119436783615chokepoint:Cape of Good Hope10
6415965625180943611559935chokepoint:Cape of Good Hope10
6415966625180973944766463chokepoint:Cape of Good Hope10
\n", "
" ], "text/plain": [ " hex_id polygon_name hex_resolution\n", "6415962 624811593067102207 chokepoint:Cape of Good Hope 10\n", "6415963 625181089372012543 chokepoint:Cape of Good Hope 10\n", "6415964 625181119436783615 chokepoint:Cape of Good Hope 10\n", "6415965 625180943611559935 chokepoint:Cape of Good Hope 10\n", "6415966 625180973944766463 chokepoint:Cape of Good Hope 10" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "imf_df_hex.tail()" ] }, { "cell_type": "code", "execution_count": 21, "id": "b78be279", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6415946" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(imf_df_hex)" ] }, { "cell_type": "code", "execution_count": 15, "id": "a8ed6d24", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "21" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "imf_df_hex.hex_id.duplicated().sum()" ] }, { "cell_type": "code", "execution_count": 16, "id": "71f5313b", "metadata": {}, "outputs": [], "source": [ "imf_df_hex = imf_df_hex[~(imf_df_hex.hex_id.duplicated())]" ] }, { "cell_type": "code", "execution_count": 22, "id": "1e4812e6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "imf_df_hex.hex_id.duplicated().sum()" ] }, { "cell_type": "markdown", "id": "672388ef", "metadata": {}, "source": [ "## AIS Data" ] }, { "cell_type": "code", "execution_count": 24, "id": "a1158c10", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hex_idpolygon_namehex_resolution
6415965625180943611559935chokepoint:Cape of Good Hope10
6415966625180973944766463chokepoint:Cape of Good Hope10
\n", "
" ], "text/plain": [ " hex_id polygon_name hex_resolution\n", "6415965 625180943611559935 chokepoint:Cape of Good Hope 10\n", "6415966 625180973944766463 chokepoint:Cape of Good Hope 10" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "imf_df_hex.tail(2)" ] }, { "cell_type": "code", "execution_count": 34, "id": "295d57bd-8ecd-4a68-9b4d-3650faab635d", "metadata": {}, "outputs": [], "source": [ "start_dates = pd.date_range(\"2023-01-01\", \"2024-03-31\", freq=\"MS\")\n", "end_dates = pd.date_range(\"2023-01-01\", \"2024-03-31\", freq=\"M\")\n", "\n", "keep_cols = [\n", " \"mmsi\",\n", " \"dt_insert_utc\",\n", " \"longitude\",\n", " \"latitude\",\n", " \"imo\",\n", " \"vessel_name\",\n", " \"vessel_type\",\n", " \"vessel_type_cargo\",\n", " \"vessel_class\",\n", " \"length\",\n", " \"width\",\n", " \"flag_country\",\n", " \"destination\",\n", " \"draught\",\n", " \"sog\",\n", " \"cog\",\n", " \"rot\",\n", " \"heading\",\n", " \"nav_status\",\n", " \"dt_pos_utc\",\n", " \"dt_static_utc\",\n", " \"vessel_type_main\",\n", " \"vessel_type_sub\",\n", "]\n", "group_by_cols = [\"mmsi\", \"imo\", \"vessel_name\", \"route_group\"]\n", "\n", "ACCESS_KEY = \"\"\n", "SECRET_KEY = \"\"" ] }, { "cell_type": "code", "execution_count": null, "id": "c2eefbae-1ec6-4218-9d91-4ab5d7bd5621", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/6 [00:00