Upload New File

54f1e7d5 · Moritz Schüler · ae42ce86 · 54f1e7d5
Commit 54f1e7d5 authored 1 month ago by Moritz Schüler
--- a/Moritz Schüler/interevent_time.py
+++ b/Moritz Schüler/interevent_time.py
+import numpy as np
+import pathpyG as pp
+import matplotlib.pyplot as plt
+from scipy.stats import poisson, zipf, norm, lognorm, kstest, chisquare
+from tqdm import tqdm
+import random 
+
+
+def mine_interevent_times(graph: pp.TemporalGraph, time_aggregation_list: list = [1,1,1]) -> dict:
+    interevent_time_dict = {}
+    edge_interevent_time_dict = {}
+    node_interevent_time_dict = {}
+    interevent_time_dict['global'] = global_interevent_time(graph, time_aggregation_list[0])
+
+    edge_intereventtime = edge_interevent_time(graph, time_aggregation_list[1])
+    edge_interevent_time_dict['aggregate'] = edge_intereventtime[1]
+    edge_interevent_time_dict['individual'] = edge_intereventtime[0]
+    interevent_time_dict['edge'] = edge_interevent_time_dict
+
+    node_intereventtime = node_interevent_time(graph, time_aggregation_list[2])
+    node_interevent_time_dict['aggregate'] = node_intereventtime[1]
+    node_interevent_time_dict['individual'] = node_intereventtime[0]
+    interevent_time_dict['node'] = node_interevent_time_dict
+
+    return interevent_time_dict
+
+def ks_test(data, dist):
+    if dist == 'lognorm':
+        data = np.array(data)
+        data = data + 1e-9
+        shape, loc, scale = lognorm.fit(data, floc=0)
+        ks_stat, ks_p_value = kstest(data, 'lognorm', args=(shape, loc, scale))
+        print('--------------------')
+        print('KS test lognorm')
+        print(f"KS Statistic: {ks_stat}")
+        print(f"P-Value: {ks_p_value}")
+
+    elif dist == 'norm':
+        mu, sigma = norm.fit(data)
+        ks_stat, ks_p_value = kstest(data, 'norm', args=(mu, sigma))
+        print('--------------------')
+        print('KS test norm')
+        print(f"KS Statistic: {ks_stat}")
+        print(f"P-Value: {ks_p_value}")
+    
+    elif dist == 'poisson':
+        lambda_est = np.mean(data)
+        poisson_dist = poisson(mu=lambda_est)
+        ks_stat, ks_p_value = kstest(data, poisson_dist.cdf)
+        print('--------------------')
+        print('KS test poisson')
+        print(f"KS Statistic: {ks_stat}")
+        print(f"P-Value: {ks_p_value}")
+    
+    elif dist == 'zipf':
+        alpha_est = 2  # Example; use MLE or other methods to estimate
+        zipf_dist = zipf(a=alpha_est)
+        ks_stat, ks_p_value = kstest(data, zipf_dist.cdf)
+        print('--------------------')
+        print('KS test zipf')
+        print(f"KS Statistic: {ks_stat}")
+        print(f"P-Value: {ks_p_value}")
+
+def chisquare_test(data, dist):
+    match dist:
+        case 'poisson':
+            unique, observed_frequencies = np.unique(data, return_counts=True)
+            lambda_poisson = np.mean(data)
+            expected_frequencies_poisson = len(data) * poisson.pmf(unique, lambda_poisson)
+            chi2_stat, chi2_p_value = chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies_poisson, sum_check=False, ddof=1)
+            print('--------------------')
+            print('chisquare test poisson')
+            print(f"chisquare Statistic: {chi2_stat}")
+            print(f"P-Value: {chi2_p_value}")
+        case 'zipf':
+            unique, observed_frequencies = np.unique(data, return_counts=True)
+            s_zipf = 1.1
+            expected_frequencies_zipf = len(data) * zipf.pmf(unique, s_zipf)
+            chi2_stat, chi2_p_value = chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies_zipf, sum_check=False, ddof=1)
+            print('--------------------')
+            print('chisquare test zipf')
+            print(f"chisquare Statistic: {chi2_stat}")
+            print(f"P-Value: {chi2_p_value}")
+        case 'norm':
+            unique, observed_frequencies = np.unique(data, return_counts=True)
+            mean_normal = np.mean(data)
+            std_normal = np.std(data, ddof=1)
+            expected_frequencies_normal = len(data) * norm.pdf(unique, mean_normal, std_normal)
+            chi2_stat, chi2_p_value = chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies_normal, sum_check=False, ddof=2)
+            print('--------------------')
+            print('chisquare test normal')
+            print(f"chisquare Statistic: {chi2_stat}")
+            print(f"P-Value: {chi2_p_value}")
+        case 'lognorm':
+            unique, observed_frequencies = np.unique(data, return_counts=True)
+            shape_lognormal, loc_lognormal, scale_lognormal = lognorm.fit(data, floc=0)  # Fit Lognormal parameters
+            expected_frequencies_lognormal = len(data) * lognorm.pdf(unique, shape_lognormal, loc_lognormal, scale_lognormal)
+            chi2_stat, chi2_p_value = chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies_lognormal, sum_check=False, ddof=2)
+            print('--------------------')
+            print('chisquare test lognormal')
+            print(f"chisquare Statistic: {chi2_stat}")
+            print(f"P-Value: {chi2_p_value}")
+
+
+def fourier(graph: pp.TemporalGraph, time_aggregation):
+    time_data = graph.data.time.tolist()
+    time_data = np.unique(time_data)
+    time_data = time_data/time_aggregation
+    time_data = np.array([int(t) for t in time_data])
+    hist, _ = np.histogram(time_data, np.arange(min(time_data), max(time_data) + 1))
+
+    fft_result = np.fft.fft(hist)
+    frequencies = np.fft.fftfreq(len(hist), d=1) 
+
+    plt.plot(frequencies[:len(frequencies)//2], np.abs(fft_result)[:len(frequencies)//2])
+    plt.axvline(time_aggregation/(60*60*24), label = 'daily pattern', color = 'r', alpha = 0.4)
+    plt.axvline(time_aggregation/(60*60*24*7), label = 'weekly pattern', color = 'g', alpha = 0.4)
+    plt.xlabel(f'Frequency in 1/{time_aggregation}s')
+    plt.ylabel('Amplitude')
+    # plt.title('Frequency Spectrum')
+    plt.legend()
+    plt.show()
+
+def plot_event_time_histogram(graph: pp.TemporalGraph):
+    time_aggregation = 60*60*24
+    time_data = graph.data.time.tolist()
+    time_data = np.unique(time_data) 
+    time_data = time_data/time_aggregation
+    time_data = np.array([int(t) for t in time_data])
+    values, counts = np.unique(time_data, return_counts=True)
+    plt.figure(figsize=(25,6))
+    plt.bar(values, counts, 1)
+    time_range = int((time_data[-1] - time_data[0])/365)
+    plt.xticks([time_data[0] + i*365 for i in range(time_range+1)], range(time_range+1))
+    plt.xlabel('years since start')
+    plt.ylabel('counts')
+    # plt.title('events aggregated to days since start of the project')
+    plt.show()
+
+    values, counts = np.unique(time_data, return_counts=True)
+    plt.figure(figsize=(25,6))
+    plt.bar(values, counts, 1)
+    time_range = int((time_data[-1] - time_data[0]))
+    plt.xticks([time_data[0] + i for i in range(time_range+1) if i%7 ==0], range(0, time_range+1, 7), rotation = 90)
+    plt.xlim(min(values), min(values) + 365)
+    plt.xlabel('days since start')
+    plt.ylabel('counts')
+    # plt.title('events aggregated to days since start of the project')
+    plt.show()
+
+
+    
+def plot_interevent_time_histograms(global_iets, edge_iets, node_iets):
+    plt.figure(figsize=(25,10))
+    values, counts = np.unique(global_iets, return_counts=True)
+    plt.bar(values, counts, 4)
+    # plt.title('global inter-event times')
+    plt.ylabel('count')
+    plt.xlabel('interevent time in hours')
+    
+    plt.show()
+    plt.figure(figsize=(25,10))
+    values, counts = np.unique(edge_iets, return_counts=True)
+    plt.bar(values, counts, 2)
+    # plt.title('edge interevent times')
+    plt.ylabel('count')
+    plt.xlabel('interevent time in days')
+    plt.show()
+    plt.figure(figsize=(25,10))
+    values, counts = np.unique(node_iets, return_counts=True)
+    plt.bar(values, counts, 2)
+    # plt.title('node interevent times')
+    plt.ylabel('count')
+    plt.xlabel('interevent time in days')
+    plt.show()
+
+def plot_distribution_comparison(interevent_times: list, level):
+    avg_interevent_time = np.mean(interevent_times)
+    std_interevent_time = np.std(interevent_times)
+
+    values, counts = np.unique(interevent_times, return_counts=True)
+    total_count = np.sum(counts)
+
+    k_values = np.arange(0, max(interevent_times))
+    x_values = np.linspace(0, max(interevent_times), 20000)
+    
+    #poisson
+    poisson_pmf = poisson.pmf(k_values, mu=avg_interevent_time)
+    poisson_pmf = poisson_pmf * total_count
+    #zipf
+    zipf_param = 1.1
+    zipf_pmf = zipf.pmf(k_values, a=zipf_param)
+    zipf_pmf = zipf_pmf * total_count
+    #normal
+    normal_pdf = norm.pdf(x_values, loc=avg_interevent_time, scale=std_interevent_time)
+    normal_pdf = normal_pdf * total_count
+    #log-normal
+    interevent_times = np.array(interevent_times)
+    interevent_times = interevent_times + 1e-9
+    shape, loc, scale = lognorm.fit(interevent_times, floc = 0)
+    lognormal_pdf = lognorm.pdf(x_values, s=shape, loc=loc, scale=scale)
+    lognormal_pdf = lognormal_pdf * total_count
+
+    plt.figure(figsize=(12, 8))
+    plt.plot(x_values, lognormal_pdf, 'c-', label='Log-Normal PDF', alpha = 1)
+    plt.plot(x_values, normal_pdf, 'm-', label=f'Normal PDF (μ={avg_interevent_time:.2f}, σ={std_interevent_time:.2f})', alpha = 1)
+    plt.plot(k_values, zipf_pmf, 'g-', label=f'Zipf PMF (a={zipf_param:.2f})', alpha = 1)
+    plt.plot(k_values, poisson_pmf, 'r-', label=f'Poisson PMF (λ={avg_interevent_time:.2f})', alpha = 1)
+    # plt.hist(interevent_times, density=True, color='blue', alpha=0.6, label='Interevent Times')
+    if level == 'global':
+        plt.xlabel('interevent time in hours')
+    else:
+        plt.xlabel('interevent time in days')
+    plt.ylabel('count')
+    
+    plt.bar(values, counts, 2, label='Interevent Times', alpha = 0.5)
+    # plt.title(f'Comparison of Interevent Time Distribution with various theoretical distributions')
+    plt.legend()
+    plt.xlim(0, 50)
+    plt.ylim(0, max(counts))
+    plt.grid(axis='y', linestyle='--', alpha=0.7)
+    plt.show()
+
+def distribution_comparison_and_hypothesis_testing(interevent_time_dict, interevent_time_dict_raw):
+    plot_distribution_comparison(interevent_time_dict['global'], 'global')
+    chisquare_test(interevent_time_dict_raw['global'], 'norm')
+    chisquare_test(interevent_time_dict_raw['global'], 'lognorm')
+    chisquare_test(interevent_time_dict_raw['global'], 'poisson')
+    chisquare_test(interevent_time_dict_raw['global'], 'zipf')
+    ks_test(interevent_time_dict_raw['global'], 'norm')
+    ks_test(interevent_time_dict_raw['global'], 'lognorm')
+
+    plot_distribution_comparison(interevent_time_dict['edge']['aggregate'], 'edge')
+    chisquare_test(interevent_time_dict_raw['edge']['aggregate'], 'norm')
+    chisquare_test(interevent_time_dict_raw['edge']['aggregate'], 'lognorm')
+    chisquare_test(interevent_time_dict_raw['edge']['aggregate'], 'poisson')
+    chisquare_test(interevent_time_dict_raw['edge']['aggregate'], 'zipf')
+    ks_test(interevent_time_dict_raw['edge']['aggregate'], 'norm')
+    ks_test(interevent_time_dict_raw['edge']['aggregate'], 'lognorm')
+    
+    plot_distribution_comparison(interevent_time_dict['node']['aggregate'], 'node')
+    chisquare_test(interevent_time_dict_raw['node']['aggregate'], 'norm')
+    chisquare_test(interevent_time_dict_raw['node']['aggregate'], 'lognorm')
+    chisquare_test(interevent_time_dict_raw['node']['aggregate'], 'poisson')
+    chisquare_test(interevent_time_dict_raw['node']['aggregate'], 'zipf')
+    ks_test(interevent_time_dict_raw['node']['aggregate'], 'norm')
+    ks_test(interevent_time_dict_raw['node']['aggregate'], 'lognorm')
+    
+
+
+
+
+def global_interevent_time(graph: pp.TemporalGraph, time_aggregation: int = 1, ignore_zero_it = True) -> list:
+    time_data = graph.data.time
+    interevent_times = time_data[1:] - time_data[:-1]
+    if ignore_zero_it:
+        interevent_times = interevent_times[interevent_times > 0]
+
+    interevent_times = interevent_times/time_aggregation
+    interevent_times = np.array([int(t) for t in interevent_times])
+
+    interevent_times = [t.item() for t in interevent_times]
+
+    return interevent_times
+
+def edge_interevent_time(graph: pp.TemporalGraph, time_aggregation: int = 1) -> tuple[dict, list]:
+    sender_nodes = list(graph.data.edge_index[0])
+    sender_nodes = [node.item() for node in sender_nodes]
+    receiver_nodes = list(graph.data.edge_index[1])
+    receiver_nodes = [node.item() for node in receiver_nodes]
+
+    edges = list(zip(sender_nodes, receiver_nodes))
+    time_data = graph.data.time
+    time_data = list(time_data)
+    interevent_time_dict = {edge: [] for edge in set(edges)}
+    time_stamp_dict = {edge: [] for edge in set(edges)}
+    sum_edge_interevent_times = []
+
+    for i in range(len(time_data)):
+        edge_time_stamp = time_data[i].item()
+        time_stamp_dict[edges[i]].append(edge_time_stamp)
+
+    for key in time_stamp_dict.keys():
+        edge_time_stamps = time_stamp_dict[key]
+        if len(edge_time_stamps) > 1:
+            edge_interevent_times = (np.array(edge_time_stamps[1:]) - np.array(edge_time_stamps[:-1]))
+            edge_interevent_times = edge_interevent_times[edge_interevent_times > 0]
+            edge_interevent_times = edge_interevent_times/time_aggregation
+            edge_interevent_times = np.array([int(t) for t in edge_interevent_times])
+            edge_interevent_times = [t.item() for t in edge_interevent_times]
+            if len(edge_interevent_times) > 0:  
+                interevent_time_dict[key] = edge_interevent_times
+                sum_edge_interevent_times += edge_interevent_times
+
+    
+
+
+    return interevent_time_dict, sum_edge_interevent_times
+
+def node_interevent_time(graph: pp.TemporalGraph, time_aggregation: int = 1) -> tuple[dict, list]:
+    sender_nodes = list(graph.data.edge_index[0])
+    sender_nodes = [node.item() for node in sender_nodes]
+    receiver_nodes = list(graph.data.edge_index[1])
+    receiver_nodes = [node.item() for node in receiver_nodes]
+
+    time_data = graph.data.time
+    time_data = list(time_data)
+    nodes = set(sender_nodes + receiver_nodes)
+    interevent_time_dict = {node: [] for node in nodes}
+    time_stamp_dict = {node: [] for node in nodes}
+    sum_node_interevent_times = []
+
+    for i in range(len(time_data)):
+        node_time_stamp = time_data[i].item()
+        time_stamp_dict[sender_nodes[i]].append(node_time_stamp)
+        time_stamp_dict[receiver_nodes[i]].append(node_time_stamp)
+
+    for key in time_stamp_dict.keys():
+        node_time_stamps = time_stamp_dict[key]
+        if len(node_time_stamps) > 1:
+            node_interevent_times = (np.array(node_time_stamps[1:]) - np.array(node_time_stamps[:-1]))
+            node_interevent_times = node_interevent_times[node_interevent_times > 0]
+            node_interevent_times = node_interevent_times/time_aggregation
+            node_interevent_times = np.array([int(t) for t in node_interevent_times])
+            node_interevent_times = [t.item() for t in node_interevent_times]
+            if len(node_interevent_times) > 0:  
+                interevent_time_dict[key] = node_interevent_times
+                sum_node_interevent_times += node_interevent_times
+
+ 
+
+
+    return interevent_time_dict, sum_node_interevent_times
+
+def response_time(graph: pp.TemporalGraph):
+    
+    senders = list(graph.data.edge_index[0])
+    senders = [node.item() for node in senders]
+    receivers = list(graph.data.edge_index[1])
+    receivers = [node.item() for node in receivers]
+    timestamps = graph.data.time
+    
+    response_times_recv_to_send = []
+    developer_rts_response_time_dict = dict()
+    
+    for i, receiver in enumerate(tqdm(receivers)):
+        for j, sender in enumerate(senders[i:]):
+            if receiver == sender:
+                response_time = timestamps[i+j] - timestamps[i]
+                if receiver not in developer_rts_response_time_dict:
+                    developer_rts_response_time_dict[receiver] = [response_time]
+                else: 
+                    developer_rts_response_time_dict[receiver].append(response_time)
+                response_times_recv_to_send.append(response_time)
+                break
+    
+    edges = list(zip(senders, receivers))
+    random.shuffle(edges)
+    shuffled_senders, shuffled_receivers = list(zip(*edges))
+    
+
+    shuffled_response_times_recv_to_send = []
+    shuffled_developer_rts_response_time_dict = dict()
+    
+    for i, receiver in enumerate(tqdm(shuffled_receivers)):
+        for j, sender in enumerate(shuffled_senders[i:]):
+            if receiver == sender:
+                response_time = timestamps[i+j] - timestamps[i]
+                if receiver not in shuffled_developer_rts_response_time_dict:
+                    shuffled_developer_rts_response_time_dict[receiver] = [response_time]
+                else: 
+                    shuffled_developer_rts_response_time_dict[receiver].append(response_time)
+                shuffled_response_times_recv_to_send.append(response_time)
+                break
+
+    response_times_recv_to_send = np.array(response_times_recv_to_send) / 3600
+    shuffled_response_times_recv_to_send = np.array(shuffled_response_times_recv_to_send) / 3600
+
+    print("Node was a receiver, how long until it sends?")
+    print(f"Mean actual response time: {np.mean(response_times_recv_to_send):.2f} hours")
+    print(f"Mean random response time: {np.mean(shuffled_response_times_recv_to_send):.2f} hours")
+
+    # plt.title("Response Time: Receiver to Sender")
+    plt.hist(response_times_recv_to_send, alpha=0.5, density=True, label="Actual Response Time", color="blue")
+    plt.axvline(np.mean(response_times_recv_to_send), alpha=0.7, label = f"Mean Actual Response Time: {np.mean(response_times_recv_to_send):.2f} h", color="blue")
+    plt.hist(shuffled_response_times_recv_to_send, alpha=0.5, density=True, label="Shuffled Response Time", color="red")
+    plt.axvline(np.mean(shuffled_response_times_recv_to_send), alpha=0.7, label = f"Mean Shuffled Response Time: {np.mean(shuffled_response_times_recv_to_send):.2f} h", color="red")
+    plt.xlabel('hours')
+    plt.legend()
+    plt.show()
+
+    response_times_send_to_recv = []
+    developer_str_response_time_dict = dict()
+
+    for i, sender in enumerate(tqdm(senders)):
+        for j, receiver in enumerate(receivers[i:]):
+            if receiver == sender:
+                response_time = timestamps[i+j] - timestamps[i]
+                if receiver not in developer_str_response_time_dict:
+                    developer_str_response_time_dict[receiver] = [response_time]
+                else: 
+                    developer_str_response_time_dict[receiver].append(response_time)
+                response_times_send_to_recv.append(response_time)
+                break
+
+    shuffled_response_times_send_to_recv = []
+    shuffled_developer_str_response_time_dict = dict()
+
+    for i, sender in enumerate(tqdm(shuffled_senders)):
+        for j, receiver in enumerate(shuffled_receivers[i:]):
+            if receiver == sender:
+                response_time = timestamps[i+j] - timestamps[i]
+                if receiver not in shuffled_developer_str_response_time_dict:
+                    shuffled_developer_str_response_time_dict[receiver] = [response_time]
+                else: 
+                    shuffled_developer_str_response_time_dict[receiver].append(response_time)
+                shuffled_response_times_send_to_recv.append(response_time)
+                break
+
+    response_times_send_to_recv = np.array(response_times_send_to_recv) / 3600
+    shuffled_response_times_send_to_recv = np.array(shuffled_response_times_send_to_recv) / 3600
+
+    print("Node was a sender, how long until it receives?")
+    print(f"Mean actual response time: {np.mean(response_times_send_to_recv):.2f} hours")
+    print(f"Mean random response time: {np.mean(shuffled_response_times_send_to_recv):.2f} hours")
+
+    # plt.title("Response Time: Sender to Receiver")
+    plt.hist(response_times_send_to_recv, alpha=0.5, density=True, label="Actual Response Time", color="blue")
+    plt.axvline(np.mean(response_times_send_to_recv), alpha=0.7, label = f"Mean Actual Response Time: {np.mean(response_times_send_to_recv):.2f} h", color="blue")
+    plt.hist(shuffled_response_times_send_to_recv, alpha=0.5, density=True, label="Shuffled Response Time", color="red")
+    plt.axvline(np.mean(shuffled_response_times_send_to_recv), alpha=0.7, label = f"Mean Shuffled Response Time: {np.mean(shuffled_response_times_send_to_recv):.2f} h", color="red")
+    plt.xlabel('hours')
+    plt.legend()
+    plt.show()
\ No newline at end of file