From 19a8f02eb3929d20e68bf54b0963dc79cc3231e8 Mon Sep 17 00:00:00 2001
From: Kerem Keptig <kerem.keptig@stud-mail.uni-wuerzburg.de>
Date: Tue, 7 Jan 2025 08:12:36 +0100
Subject: [PATCH] Upload New File

---
 temporal-bipartite.ipynb | 428 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 428 insertions(+)
 create mode 100644 temporal-bipartite.ipynb

diff --git a/temporal-bipartite.ipynb b/temporal-bipartite.ipynb
new file mode 100644
index 0000000..4e50728
--- /dev/null
+++ b/temporal-bipartite.ipynb
@@ -0,0 +1,428 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pathpyG as pp\n",
+    "from pathpyG.algorithms.components import connected_components\n",
+    "from pathpyG.algorithms.temporal import lift_order_temporal\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import torch\n",
+    "import networkx as nx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "def preprocess(dataset_name: str):\n",
+    "    \"\"\"\n",
+    "    Read the original data file and return a tuple (df, feats).\n",
+    "    The first line is skipped; every other line is split on ',' into\n",
+    "    u, i, ts, label and any number of float features. df has the columns\n",
+    "    ['u', 'i', 'ts', 'label', 'idx'] (idx is the 0-based row number) and feats is\n",
+    "    np.array of the per-row feature arrays.\n",
+    "    Raises ValueError if a timestamp is smaller than the preceding one.\n",
+    "    \"\"\"\n",
+    "    u_list, i_list, ts_list, label_list, feat_l = [], [], [], [], []\n",
+    "\n",
+    "    with open(dataset_name) as f:\n",
+    "        next(f)  # skip the first line\n",
+    "        previous_time = float(\"-inf\")  # any first timestamp is accepted\n",
+    "        for idx, line in enumerate(f):\n",
+    "            e = line.strip().split(',')\n",
+    "            ts = float(e[2])\n",
+    "            # Raise explicitly: an assert would be skipped when Python runs with -O.\n",
+    "            if not ts >= previous_time:\n",
+    "                raise ValueError(f\"Timestamps must be non-decreasing: row {idx} has {ts} after {previous_time}.\")\n",
+    "            previous_time = ts\n",
+    "            u_list.append(int(e[0]))\n",
+    "            i_list.append(int(e[1]))\n",
+    "            ts_list.append(ts)\n",
+    "            label_list.append(float(e[3]))\n",
+    "            feat_l.append(np.array([float(x) for x in e[4:]]))\n",
+    "\n",
+    "    return pd.DataFrame({\n",
+    "        'u': u_list,\n",
+    "        'i': i_list,\n",
+    "        'ts': ts_list,\n",
+    "        'label': label_list,\n",
+    "        'idx': list(range(len(u_list))),\n",
+    "    }), np.array(feat_l)\n",
+    "\n",
+    "\n",
+    "def reindex(df: pd.DataFrame, bipartite: bool = True):\n",
+    "    \"\"\"Return a copy of df with the 'u', 'i' and 'idx' columns incremented by 1.\n",
+    "    If bipartite, 'i' is first offset by max(u) + 1.\"\"\"\n",
+    "    shifted = df.copy()\n",
+    "    if bipartite:\n",
+    "        # Offsetting by max(u) + 1 places item ids above the user ids (assuming non-negative ids).\n",
+    "        shifted[\"i\"] = df[\"i\"] + (df[\"u\"].max() + 1)\n",
+    "    for column in (\"u\", \"i\", \"idx\"):\n",
+    "        shifted[column] = shifted[column] + 1\n",
+    "    return shifted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "class BipartiteTemporalPercolation:\n",
+    "    \"\"\"Percolation and structural statistics for a temporal graph (uses pathpyG via `pp`).\"\"\"\n",
+    "\n",
+    "    def __init__(self, temporal_graph):\n",
+    "        self.temporal_graph = temporal_graph\n",
+    "\n",
+    "    def create_event_graph(self, delta_t):\n",
+    "        \"\"\"Lift the temporal graph with time window delta_t and return the result as a pp.Graph.\"\"\"\n",
+    "        event_edge_index = pp.algorithms.temporal.lift_order_temporal(self.temporal_graph, delta=delta_t)\n",
+    "\n",
+    "        # Count along the last axis: assuming the usual (2, num_edges) edge-index layout,\n",
+    "        # len() would always report 2 instead of the number of event-graph edges.\n",
+    "        num_event_edges = 0 if event_edge_index is None else event_edge_index.shape[-1]\n",
+    "        if num_event_edges == 0:\n",
+    "            print(f\"[DEBUG] No edges in event_edge_index for δ={delta_t}\")\n",
+    "        else:\n",
+    "            print(f\"[DEBUG] Number of edges in event_edge_index for δ={delta_t}: {num_event_edges}\")\n",
+    "\n",
+    "        return pp.Graph.from_edge_index(event_edge_index)\n",
+    "\n",
+    "    def percolation_analysis(self, delta_t_values):\n",
+    "        \"\"\"\n",
+    "        Perform temporal percolation analysis over a range of delta_t values.\n",
+    "\n",
+    "        Returns a dict that maps each delta_t to a dict with the keys\n",
+    "        \"largest_component_size\", \"total_components\" and \"average_component_size\".\n",
+    "        All three are 0 when no components are found or the computation raises.\n",
+    "        \"\"\"\n",
+    "        largest_components = {}\n",
+    "\n",
+    "        for delta_t in delta_t_values:\n",
+    "            print(f\"Processing δ={delta_t}...\")\n",
+    "            # Fallback entry, replaced below once component sizes are available.\n",
+    "            stats = {\"largest_component_size\": 0, \"total_components\": 0, \"average_component_size\": 0}\n",
+    "            try:\n",
+    "                event_graph = self.create_event_graph(delta_t)\n",
+    "                num_components, labels = pp.algorithms.components.connected_components(event_graph)\n",
+    "                print(f\"Number of components: {num_components}\")\n",
+    "                print(f\"Labels: {labels}\")\n",
+    "                # as_tensor accepts arrays and tensors alike, without re-wrapping an existing tensor.\n",
+    "                uniques, component_sizes = torch.unique(torch.as_tensor(labels), return_counts=True)\n",
+    "                print(component_sizes)\n",
+    "                print(uniques)\n",
+    "\n",
+    "                if component_sizes.numel() == 0:  # Check if component_sizes is empty\n",
+    "                    print(f\"No connected components found for δ={delta_t}.\")\n",
+    "                else:\n",
+    "                    stats = {\n",
+    "                        \"largest_component_size\": component_sizes.max().item(),\n",
+    "                        \"total_components\": len(component_sizes),\n",
+    "                        \"average_component_size\": component_sizes.float().mean().item(),\n",
+    "                    }\n",
+    "            except Exception as e:\n",
+    "                # Best effort: report the failure and carry on with the remaining delta_t values.\n",
+    "                print(f\"Error at δ={delta_t}: {e}\")\n",
+    "\n",
+    "            largest_components[delta_t] = stats\n",
+    "\n",
+    "        return largest_components\n",
+    "\n",
+    "    def find_critical_threshold(self, results):\n",
+    "        \"\"\"\n",
+    "        Identify the critical delta_t where the largest connected component (LCC) rapidly grows.\n",
+    "\n",
+    "        results maps delta_t to a dict holding \"largest_component_size\". Returns the delta_t\n",
+    "        right after the largest relative LCC increase; raises ValueError for fewer than two entries.\n",
+    "        \"\"\"\n",
+    "        delta_t_values = list(results.keys())\n",
+    "        sizes = np.array([result[\"largest_component_size\"] for result in results.values()], dtype=float)\n",
+    "\n",
+    "        if len(sizes) < 2:\n",
+    "            raise ValueError(\"Not enough data points to determine the critical threshold.\")\n",
+    "\n",
+    "        # Relative growth between consecutive delta_t values. Where the previous size is 0 the\n",
+    "        # quotient is undefined, so use inf for a jump from 0 and 0 when the size stays at 0;\n",
+    "        # a plain division would yield NaN there and argmax would then select that step.\n",
+    "        previous, growth = sizes[:-1], np.diff(sizes)\n",
+    "        undefined = np.where(growth > 0, np.inf, 0.0)\n",
+    "        relative_growth = np.divide(growth, previous, out=undefined, where=previous > 0)\n",
+    "\n",
+    "        # Return the delta_t corresponding to the largest jump\n",
+    "        return delta_t_values[int(np.argmax(relative_growth)) + 1]\n",
+    "\n",
+    "    def compute_shortest_path_lengths(self, delta_t):\n",
+    "        \"\"\"Compute the average shortest path length for a specific delta_t (unreachable pairs are ignored).\"\"\"\n",
+    "        dist_matrix, _ = pp.algorithms.temporal.temporal_shortest_paths(self.temporal_graph, delta=delta_t)\n",
+    "        dist_matrix[dist_matrix == float('inf')] = np.nan  # Replace infinity with NaN for unreachable nodes\n",
+    "        return np.nanmean(dist_matrix)\n",
+    "\n",
+    "    def _static_nx_graph(self):\n",
+    "        \"\"\"Return the unweighted static projection of the temporal graph as a networkx Graph.\"\"\"\n",
+    "        static_graph = self.temporal_graph.to_static_graph(weighted=False)\n",
+    "        edge_index = static_graph.data.edge_index.cpu().detach().numpy()\n",
+    "        nx_graph = nx.Graph()\n",
+    "        nx_graph.add_edges_from(edge_index.T)  # transposed so that each row is one node pair\n",
+    "        return nx_graph\n",
+    "\n",
+    "    def compute_clustering_coefficient(self, delta_t):\n",
+    "        \"\"\"\n",
+    "        Compute the average clustering coefficient of the static projection (NOTE(review): delta_t is unused).\n",
+    "        \"\"\"\n",
+    "        return nx.average_clustering(self._static_nx_graph())\n",
+    "\n",
+    "    def plot_percolation_results(self, results):\n",
+    "        \"\"\"\n",
+    "        Plot the largest component size as a function of delta_t.\n",
+    "\n",
+    "        results maps delta_t either to a stats dict with \"largest_component_size\" or to the size itself.\n",
+    "        \"\"\"\n",
+    "        delta_t_values = list(results.keys())\n",
+    "        # percolation_analysis yields a stats dict per delta_t; extract the numeric size to plot.\n",
+    "        largest_component_sizes = [v[\"largest_component_size\"] if isinstance(v, dict) else v for v in results.values()]\n",
+    "\n",
+    "        plt.figure(figsize=(10, 6))\n",
+    "        plt.plot(delta_t_values, largest_component_sizes, marker='o', linestyle='-', color='b')\n",
+    "        plt.xlabel(\"δt (time threshold)\")\n",
+    "        plt.ylabel(\"Largest Component Size\")\n",
+    "        plt.title(\"Temporal Percolation Analysis: Largest Component vs δt\")\n",
+    "        plt.grid(True)\n",
+    "        plt.show()\n",
+    "\n",
+    "    def plot_degree_distribution(self, delta_t, title):\n",
+    "        \"\"\"\n",
+    "        Plot degree histograms of the two bipartite node sets of the static projection.\n",
+    "\n",
+    "        Only the largest connected component is used when the graph is disconnected.\n",
+    "        NOTE(review): delta_t is not used. Raises ValueError if the graph is not bipartite.\n",
+    "        \"\"\"\n",
+    "        nx_graph = self._static_nx_graph()\n",
+    "        if not nx.is_connected(nx_graph):\n",
+    "            largest_cc = max(nx.connected_components(nx_graph), key=len)\n",
+    "            nx_graph = nx_graph.subgraph(largest_cc).copy()\n",
+    "        if not nx.algorithms.bipartite.is_bipartite(nx_graph):\n",
+    "            raise ValueError(\"The graph is not bipartite.\")\n",
+    "        set_a, set_b = nx.algorithms.bipartite.sets(nx_graph)\n",
+    "        plt.figure(figsize=(8, 6))\n",
+    "        for label, nodes in ((\"Set A Degrees\", set_a), (\"Set B Degrees\", set_b)):\n",
+    "            plt.hist([nx_graph.degree(node) for node in nodes], bins=20, alpha=0.7, label=label)\n",
+    "        plt.xlabel(\"Degree\")\n",
+    "        plt.ylabel(\"Frequency\")\n",
+    "        plt.title(title)\n",
+    "        plt.legend()\n",
+    "        plt.grid(True)\n",
+    "        plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of duplicates: 0\n",
+      "Number of temporal edges: 157474\n",
+      "Temporal Graph with 9227 nodes, 18257 unique edges and 157474 events in [0.0, 2678373.0]\n",
+      "{'Edge Attributes': {}, 'Graph Attributes': {'num_nodes': \"<class 'int'>\"}, 'Node Attributes': {}}\n",
+      "Graph density: 0.0018498430756906205\n"
+     ]
+    },
+    {
+     "ename": "TypeError",
+     "evalue": "'Tensor' object is not callable",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[32], line 32\u001b[0m\n\u001b[0;32m     30\u001b[0m \u001b[38;5;66;03m# Connected Components\u001b[39;00m\n\u001b[0;32m     31\u001b[0m num_components, labels \u001b[38;5;241m=\u001b[39m pp\u001b[38;5;241m.\u001b[39malgorithms\u001b[38;5;241m.\u001b[39mcomponents\u001b[38;5;241m.\u001b[39mconnected_components(temporal_graph)\n\u001b[1;32m---> 32\u001b[0m largest_component_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mmax\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcount\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mset\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     33\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNumber of components: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnum_components\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m     34\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLargest component size: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlargest_component_size\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
+      "\u001b[1;31mTypeError\u001b[0m: 'Tensor' object is not callable"
+     ]
+    }
+   ],
+   "source": [
+    "# Main Execution\n",
+    "file_path = \"reddit.csv\"\n",
+    "bipartite = True\n",
+    "\n",
+    "# Preprocess data (read and reindex the file once)\n",
+    "df, edge_feats = preprocess(file_path)\n",
+    "df = reindex(df, bipartite=bipartite)\n",
+    "\n",
+    "# Ensure all columns have consistent data types\n",
+    "df['ts'] = df['ts'].round(0).astype(int)\n",
+    "df['u'] = df['u'].astype(int)\n",
+    "df['i'] = df['i'].astype(int)\n",
+    "\n",
+    "# Optional: Check for duplicates (based on u, i, ts)\n",
+    "duplicates = df[df.duplicated(subset=['u', 'i', 'ts'], keep=False)]\n",
+    "print(f\"Number of duplicates: {len(duplicates)}\")\n",
+    "\n",
+    "# All temporal edges (default approach)\n",
+    "tedges = list(df[['u', 'i', 'ts']].itertuples(index=False, name=None))\n",
+    "temporal_graph = pp.TemporalGraph.from_edge_list(tedges)\n",
+    "print(f\"Graph created with {len(temporal_graph.edges)} unique edges and {len(tedges)} temporal events.\")\n",
+    "\n",
+    "print(f\"First 10 temporal edges: {tedges[:10]}\")\n",
+    "\n",
+    "# Print temporal graph summary\n",
+    "print(temporal_graph)\n",
+    "\n",
+    "# NOTE(review): the print above labels len(temporal_graph.edges) as unique edges while the\n",
+    "# one below labels it as temporal events; confirm which of the two .edges actually holds.\n",
+    "temporal_edges = temporal_graph.edges\n",
+    "print(f\"Number of temporal events: {len(temporal_edges)}\")\n",
+    "# Slice five entries so that the output matches its label.\n",
+    "print(f\"First 5 temporal edges: {temporal_edges[:5]}\")\n",
+    "\n",
+    "# edges / (n * (n - 1)): density formula of a directed graph without self-loops\n",
+    "num_nodes = len(temporal_graph.nodes)\n",
+    "density = len(temporal_graph.edges) / (num_nodes * (num_nodes - 1))\n",
+    "print(f\"Graph density: {density}\")\n",
+    "\n",
+    "bipartite_percolation = BipartiteTemporalPercolation(temporal_graph)\n",
+    "\n",
+    "# Named t_min/t_max on purpose: binding these tensors to min/max would shadow the builtins and\n",
+    "# break every later max(...)/min(...) call with TypeError: 'Tensor' object is not callable.\n",
+    "t_min = temporal_graph['time'].min()\n",
+    "t_max = temporal_graph['time'].max()\n",
+    "\n",
+    "# Four evenly spaced δt values from 10000 up to the largest value of temporal_graph['time']\n",
+    "delta_t_values = np.linspace(10000, t_max, 4)\n",
+    "analysis_results = bipartite_percolation.percolation_analysis(delta_t_values)\n",
+    "\n",
+    "# δt with the maximal largest_component_size (np.argmax picks the first one on ties)\n",
+    "critical_delta_t = float(delta_t_values[np.argmax([result[\"largest_component_size\"] for result in analysis_results.values()])])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 1000x600 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 1000x600 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 1000x600 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Plot the analysis results: one figure per component statistic, each plotted against δt\n",
+    "delta_t_values = list(analysis_results.keys())\n",
+    "result_entries = list(analysis_results.values())\n",
+    "\n",
+    "# One series per statistic, in the iteration order of analysis_results (entries lacking the key are skipped)\n",
+    "largest_component_sizes = [\n",
+    "    entry[\"largest_component_size\"] for entry in result_entries if \"largest_component_size\" in entry\n",
+    "]\n",
+    "\n",
+    "total_components = [\n",
+    "    entry[\"total_components\"] for entry in result_entries if \"total_components\" in entry\n",
+    "]\n",
+    "\n",
+    "average_component_sizes = [\n",
+    "    entry[\"average_component_size\"] for entry in result_entries if \"average_component_size\" in entry\n",
+    "]\n",
+    "\n",
+    "# (y-axis label, title, series, extra keyword arguments for plt.plot) of each figure\n",
+    "figure_specs = [\n",
+    "    (\"Largest Component Size\", \"Largest Component Size vs δt\", largest_component_sizes, {}),\n",
+    "    (\"Total Components\", \"Total Components vs δt\", total_components, {\"color\": \"r\"}),\n",
+    "    (\"Average Component Size\", \"Average Component Size vs δt\", average_component_sizes, {\"color\": \"g\"}),\n",
+    "]\n",
+    "\n",
+    "for y_label, figure_title, series, line_kwargs in figure_specs:\n",
+    "    plt.figure(figsize=(10, 6))\n",
+    "    plt.plot(delta_t_values, series, marker='o', **line_kwargs)\n",
+    "    plt.xlabel(\"δt (seconds)\")\n",
+    "    plt.ylabel(y_label)\n",
+    "    plt.title(figure_title)\n",
+    "    plt.grid(True)\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# Clustering Coefficient & Shortest Path Length, evaluated below, at and above the critical δt\n",
+    "delta_t_probes = (0.5 * critical_delta_t, critical_delta_t, 1.5 * critical_delta_t)\n",
+    "clustering_pre, clustering_at, clustering_post = [\n",
+    "    bipartite_percolation.compute_clustering_coefficient(probe) for probe in delta_t_probes\n",
+    "]\n",
+    "avg_path_length_pre, avg_path_length_at, avg_path_length_post = [\n",
+    "    bipartite_percolation.compute_shortest_path_lengths(probe) for probe in delta_t_probes\n",
+    "]\n",
+    "\n",
+    "print(f\"Clustering Coefficient (δt < δt_c): {clustering_pre}\")\n",
+    "print(f\"Clustering Coefficient (δt = δt_c): {clustering_at}\")\n",
+    "print(f\"Clustering Coefficient (δt > δt_c): {clustering_post}\")\n",
+    "\n",
+    "print(f\"Average Path Length (δt < δt_c): {avg_path_length_pre}\")\n",
+    "print(f\"Average Path Length (δt = δt_c): {avg_path_length_at}\")\n",
+    "print(f\"Average Path Length (δt > δt_c): {avg_path_length_post}\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "myDefault",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
-- 
GitLab