From 54f1e7d5c0e2a49c53e822547b4f5536e7c7690c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Sch=C3=BCler?= <moritz.schueler@stud-mail.uni-wuerzburg.de> Date: Tue, 18 Mar 2025 16:00:24 +0100 Subject: [PATCH] Upload New File --- "Moritz Sch\303\274ler/interevent_time.py" | 434 +++++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 "Moritz Sch\303\274ler/interevent_time.py" diff --git "a/Moritz Sch\303\274ler/interevent_time.py" "b/Moritz Sch\303\274ler/interevent_time.py" new file mode 100644 index 0000000..24dbec3 --- /dev/null +++ "b/Moritz Sch\303\274ler/interevent_time.py" @@ -0,0 +1,434 @@ +import numpy as np +import pathpyG as pp +import matplotlib.pyplot as plt +from scipy.stats import poisson, zipf, norm, lognorm, kstest, chisquare +from tqdm import tqdm +import random + + +def mine_interevent_times(graph: pp.TemporalGraph, time_aggregation_list: list = [1,1,1]) -> dict: + interevent_time_dict = {} + edge_interevent_time_dict = {} + node_interevent_time_dict = {} + interevent_time_dict['global'] = global_interevent_time(graph, time_aggregation_list[0]) + + edge_intereventtime = edge_interevent_time(graph, time_aggregation_list[1]) + edge_interevent_time_dict['aggregate'] = edge_intereventtime[1] + edge_interevent_time_dict['individual'] = edge_intereventtime[0] + interevent_time_dict['edge'] = edge_interevent_time_dict + + node_intereventtime = node_interevent_time(graph, time_aggregation_list[2]) + node_interevent_time_dict['aggregate'] = node_intereventtime[1] + node_interevent_time_dict['individual'] = node_intereventtime[0] + interevent_time_dict['node'] = node_interevent_time_dict + + return interevent_time_dict + +def ks_test(data, dist): + if dist == 'lognorm': + data = np.array(data) + data = data + 1e-9 + shape, loc, scale = lognorm.fit(data, floc=0) + ks_stat, ks_p_value = kstest(data, 'lognorm', args=(shape, loc, scale)) + print('--------------------') + print('KS test lognorm') + print(f"KS Statistic: {ks_stat}") + print(f"P-Value: {ks_p_value}") + + elif dist == 'norm': + mu, sigma = norm.fit(data) + ks_stat, ks_p_value = kstest(data, 'norm', args=(mu, sigma)) + print('--------------------') + print('KS test norm') + print(f"KS Statistic: {ks_stat}") + print(f"P-Value: {ks_p_value}") + + elif dist == 'poisson': + lambda_est = np.mean(data) + poisson_dist = poisson(mu=lambda_est) + ks_stat, ks_p_value = kstest(data, poisson_dist.cdf) + print('--------------------') + print('KS test poisson') + print(f"KS Statistic: {ks_stat}") + print(f"P-Value: {ks_p_value}") + + elif dist == 'zipf': + alpha_est = 2 # Example; use MLE or other methods to estimate + zipf_dist = zipf(a=alpha_est) + ks_stat, ks_p_value = kstest(data, zipf_dist.cdf) + print('--------------------') + print('KS test zipf') + print(f"KS Statistic: {ks_stat}") + print(f"P-Value: {ks_p_value}") + +def chisquare_test(data, dist): + match dist: + case 'poisson': + unique, observed_frequencies = np.unique(data, return_counts=True) + lambda_poisson = np.mean(data) + expected_frequencies_poisson = len(data) * poisson.pmf(unique, lambda_poisson) + chi2_stat, chi2_p_value = chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies_poisson, sum_check=False, ddof=1) + print('--------------------') + print('chisquare test poisson') + print(f"chisquare Statistic: {chi2_stat}") + print(f"P-Value: {chi2_p_value}") + case 'zipf': + unique, observed_frequencies = np.unique(data, return_counts=True) + s_zipf = 1.1 + expected_frequencies_zipf = len(data) * zipf.pmf(unique, s_zipf) + chi2_stat, chi2_p_value = chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies_zipf, sum_check=False, ddof=1) + print('--------------------') + print('chisquare test zipf') + print(f"chisquare Statistic: {chi2_stat}") + print(f"P-Value: {chi2_p_value}") + case 'norm': + unique, observed_frequencies = np.unique(data, return_counts=True) + mean_normal = np.mean(data) + std_normal = np.std(data, ddof=1) + expected_frequencies_normal = len(data) * norm.pdf(unique, mean_normal, std_normal) + chi2_stat, chi2_p_value = chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies_normal, sum_check=False, ddof=2) + print('--------------------') + print('chisquare test normal') + print(f"chisquare Statistic: {chi2_stat}") + print(f"P-Value: {chi2_p_value}") + case 'lognorm': + unique, observed_frequencies = np.unique(data, return_counts=True) + shape_lognormal, loc_lognormal, scale_lognormal = lognorm.fit(data, floc=0) # Fit Lognormal parameters + expected_frequencies_lognormal = len(data) * lognorm.pdf(unique, shape_lognormal, loc_lognormal, scale_lognormal) + chi2_stat, chi2_p_value = chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies_lognormal, sum_check=False, ddof=2) + print('--------------------') + print('chisquare test lognormal') + print(f"chisquare Statistic: {chi2_stat}") + print(f"P-Value: {chi2_p_value}") + + +def fourier(graph: pp.TemporalGraph, time_aggregation): + time_data = graph.data.time.tolist() + time_data = np.unique(time_data) + time_data = time_data/time_aggregation + time_data = np.array([int(t) for t in time_data]) + hist, _ = np.histogram(time_data, np.arange(min(time_data), max(time_data) + 1)) + + fft_result = np.fft.fft(hist) + frequencies = np.fft.fftfreq(len(hist), d=1) + + plt.plot(frequencies[:len(frequencies)//2], np.abs(fft_result)[:len(frequencies)//2]) + plt.axvline(time_aggregation/(60*60*24), label = 'daily pattern', color = 'r', alpha = 0.4) + plt.axvline(time_aggregation/(60*60*24*7), label = 'weekly pattern', color = 'g', alpha = 0.4) + plt.xlabel(f'Frequency in 1/{time_aggregation}s') + plt.ylabel('Amplitude') + # plt.title('Frequency Spectrum') + plt.legend() + plt.show() + +def plot_event_time_histogram(graph: pp.TemporalGraph): + time_aggregation = 60*60*24 + time_data = graph.data.time.tolist() + time_data = np.unique(time_data) + time_data = time_data/time_aggregation + time_data = np.array([int(t) for t in time_data]) + values, counts = np.unique(time_data, return_counts=True) + plt.figure(figsize=(25,6)) + plt.bar(values, counts, 1) + time_range = int((time_data[-1] - time_data[0])/365) + plt.xticks([time_data[0] + i*365 for i in range(time_range+1)], range(time_range+1)) + plt.xlabel('years since start') + plt.ylabel('counts') + # plt.title('events aggregated to days since start of the project') + plt.show() + + values, counts = np.unique(time_data, return_counts=True) + plt.figure(figsize=(25,6)) + plt.bar(values, counts, 1) + time_range = int((time_data[-1] - time_data[0])) + plt.xticks([time_data[0] + i for i in range(time_range+1) if i%7 ==0], range(0, time_range+1, 7), rotation = 90) + plt.xlim(min(values), min(values) + 365) + plt.xlabel('days since start') + plt.ylabel('counts') + # plt.title('events aggregated to days since start of the project') + plt.show() + + + +def plot_interevent_time_histograms(global_iets, edge_iets, node_iets): + plt.figure(figsize=(25,10)) + values, counts = np.unique(global_iets, return_counts=True) + plt.bar(values, counts, 4) + # plt.title('global inter-event times') + plt.ylabel('count') + plt.xlabel('interevent time in hours') + + plt.show() + plt.figure(figsize=(25,10)) + values, counts = np.unique(edge_iets, return_counts=True) + plt.bar(values, counts, 2) + # plt.title('edge interevent times') + plt.ylabel('count') + plt.xlabel('interevent time in days') + plt.show() + plt.figure(figsize=(25,10)) + values, counts = np.unique(node_iets, return_counts=True) + plt.bar(values, counts, 2) + # plt.title('node interevent times') + plt.ylabel('count') + plt.xlabel('interevent time in days') + plt.show() + +def plot_distribution_comparison(interevent_times: list, level): + avg_interevent_time = np.mean(interevent_times) + std_interevent_time = np.std(interevent_times) + + values, counts = np.unique(interevent_times, return_counts=True) + total_count = np.sum(counts) + + k_values = np.arange(0, max(interevent_times)) + x_values = np.linspace(0, max(interevent_times), 20000) + + #poisson + poisson_pmf = poisson.pmf(k_values, mu=avg_interevent_time) + poisson_pmf = poisson_pmf * total_count + #zipf + zipf_param = 1.1 + zipf_pmf = zipf.pmf(k_values, a=zipf_param) + zipf_pmf = zipf_pmf * total_count + #normal + normal_pdf = norm.pdf(x_values, loc=avg_interevent_time, scale=std_interevent_time) + normal_pdf = normal_pdf * total_count + #log-normal + interevent_times = np.array(interevent_times) + interevent_times = interevent_times + 1e-9 + shape, loc, scale = lognorm.fit(interevent_times, floc = 0) + lognormal_pdf = lognorm.pdf(x_values, s=shape, loc=loc, scale=scale) + lognormal_pdf = lognormal_pdf * total_count + + plt.figure(figsize=(12, 8)) + plt.plot(x_values, lognormal_pdf, 'c-', label='Log-Normal PDF', alpha = 1) + plt.plot(x_values, normal_pdf, 'm-', label=f'Normal PDF (μ={avg_interevent_time:.2f}, σ={std_interevent_time:.2f})', alpha = 1) + plt.plot(k_values, zipf_pmf, 'g-', label=f'Zipf PMF (a={zipf_param:.2f})', alpha = 1) + plt.plot(k_values, poisson_pmf, 'r-', label=f'Poisson PMF (λ={avg_interevent_time:.2f})', alpha = 1) + # plt.hist(interevent_times, density=True, color='blue', alpha=0.6, label='Interevent Times') + if level == 'global': + plt.xlabel('interevent time in hours') + else: + plt.xlabel('interevent time in days') + plt.ylabel('count') + + plt.bar(values, counts, 2, label='Interevent Times', alpha = 0.5) + # plt.title(f'Comparison of Interevent Time Distribution with various theoretical distributions') + plt.legend() + plt.xlim(0, 50) + plt.ylim(0, max(counts)) + plt.grid(axis='y', linestyle='--', alpha=0.7) + plt.show() + +def distribution_comparison_and_hypothesis_testing(interevent_time_dict, interevent_time_dict_raw): + plot_distribution_comparison(interevent_time_dict['global'], 'global') + chisquare_test(interevent_time_dict_raw['global'], 'norm') + chisquare_test(interevent_time_dict_raw['global'], 'lognorm') + chisquare_test(interevent_time_dict_raw['global'], 'poisson') + chisquare_test(interevent_time_dict_raw['global'], 'zipf') + ks_test(interevent_time_dict_raw['global'], 'norm') + ks_test(interevent_time_dict_raw['global'], 'lognorm') + + plot_distribution_comparison(interevent_time_dict['edge']['aggregate'], 'edge') + chisquare_test(interevent_time_dict_raw['edge']['aggregate'], 'norm') + chisquare_test(interevent_time_dict_raw['edge']['aggregate'], 'lognorm') + chisquare_test(interevent_time_dict_raw['edge']['aggregate'], 'poisson') + chisquare_test(interevent_time_dict_raw['edge']['aggregate'], 'zipf') + ks_test(interevent_time_dict_raw['edge']['aggregate'], 'norm') + ks_test(interevent_time_dict_raw['edge']['aggregate'], 'lognorm') + + plot_distribution_comparison(interevent_time_dict['node']['aggregate'], 'node') + chisquare_test(interevent_time_dict_raw['node']['aggregate'], 'norm') + chisquare_test(interevent_time_dict_raw['node']['aggregate'], 'lognorm') + chisquare_test(interevent_time_dict_raw['node']['aggregate'], 'poisson') + chisquare_test(interevent_time_dict_raw['node']['aggregate'], 'zipf') + ks_test(interevent_time_dict_raw['node']['aggregate'], 'norm') + ks_test(interevent_time_dict_raw['node']['aggregate'], 'lognorm') + + + + + +def global_interevent_time(graph: pp.TemporalGraph, time_aggregation: int = 1, ignore_zero_it = True) -> list: + time_data = graph.data.time + interevent_times = time_data[1:] - time_data[:-1] + if ignore_zero_it: + interevent_times = interevent_times[interevent_times > 0] + + interevent_times = interevent_times/time_aggregation + interevent_times = np.array([int(t) for t in interevent_times]) + + interevent_times = [t.item() for t in interevent_times] + + return interevent_times + +def edge_interevent_time(graph: pp.TemporalGraph, time_aggregation: int = 1) -> tuple[dict, list]: + sender_nodes = list(graph.data.edge_index[0]) + sender_nodes = [node.item() for node in sender_nodes] + receiver_nodes = list(graph.data.edge_index[1]) + receiver_nodes = [node.item() for node in receiver_nodes] + + edges = list(zip(sender_nodes, receiver_nodes)) + time_data = graph.data.time + time_data = list(time_data) + interevent_time_dict = {edge: [] for edge in set(edges)} + time_stamp_dict = {edge: [] for edge in set(edges)} + sum_edge_interevent_times = [] + + for i in range(len(time_data)): + edge_time_stamp = time_data[i].item() + time_stamp_dict[edges[i]].append(edge_time_stamp) + + for key in time_stamp_dict.keys(): + edge_time_stamps = time_stamp_dict[key] + if len(edge_time_stamps) > 1: + edge_interevent_times = (np.array(edge_time_stamps[1:]) - np.array(edge_time_stamps[:-1])) + edge_interevent_times = edge_interevent_times[edge_interevent_times > 0] + edge_interevent_times = edge_interevent_times/time_aggregation + edge_interevent_times = np.array([int(t) for t in edge_interevent_times]) + edge_interevent_times = [t.item() for t in edge_interevent_times] + if len(edge_interevent_times) > 0: + interevent_time_dict[key] = edge_interevent_times + sum_edge_interevent_times += edge_interevent_times + + + + + return interevent_time_dict, sum_edge_interevent_times + +def node_interevent_time(graph: pp.TemporalGraph, time_aggregation: int = 1) -> tuple[dict, list]: + sender_nodes = list(graph.data.edge_index[0]) + sender_nodes = [node.item() for node in sender_nodes] + receiver_nodes = list(graph.data.edge_index[1]) + receiver_nodes = [node.item() for node in receiver_nodes] + + time_data = graph.data.time + time_data = list(time_data) + nodes = set(sender_nodes + receiver_nodes) + interevent_time_dict = {node: [] for node in nodes} + time_stamp_dict = {node: [] for node in nodes} + sum_node_interevent_times = [] + + for i in range(len(time_data)): + node_time_stamp = time_data[i].item() + time_stamp_dict[sender_nodes[i]].append(node_time_stamp) + time_stamp_dict[receiver_nodes[i]].append(node_time_stamp) + + for key in time_stamp_dict.keys(): + node_time_stamps = time_stamp_dict[key] + if len(node_time_stamps) > 1: + node_interevent_times = (np.array(node_time_stamps[1:]) - np.array(node_time_stamps[:-1])) + node_interevent_times = node_interevent_times[node_interevent_times > 0] + node_interevent_times = node_interevent_times/time_aggregation + node_interevent_times = np.array([int(t) for t in node_interevent_times]) + node_interevent_times = [t.item() for t in node_interevent_times] + if len(node_interevent_times) > 0: + interevent_time_dict[key] = node_interevent_times + sum_node_interevent_times += node_interevent_times + + + + + return interevent_time_dict, sum_node_interevent_times + +def response_time(graph: pp.TemporalGraph): + + senders = list(graph.data.edge_index[0]) + senders = [node.item() for node in senders] + receivers = list(graph.data.edge_index[1]) + receivers = [node.item() for node in receivers] + timestamps = graph.data.time + + response_times_recv_to_send = [] + developer_rts_response_time_dict = dict() + + for i, receiver in enumerate(tqdm(receivers)): + for j, sender in enumerate(senders[i:]): + if receiver == sender: + response_time = timestamps[i+j] - timestamps[i] + if receiver not in developer_rts_response_time_dict: + developer_rts_response_time_dict[receiver] = [response_time] + else: + developer_rts_response_time_dict[receiver].append(response_time) + response_times_recv_to_send.append(response_time) + break + + edges = list(zip(senders, receivers)) + random.shuffle(edges) + shuffled_senders, shuffled_receivers = list(zip(*edges)) + + + shuffled_response_times_recv_to_send = [] + shuffled_developer_rts_response_time_dict = dict() + + for i, receiver in enumerate(tqdm(shuffled_receivers)): + for j, sender in enumerate(shuffled_senders[i:]): + if receiver == sender: + response_time = timestamps[i+j] - timestamps[i] + if receiver not in shuffled_developer_rts_response_time_dict: + shuffled_developer_rts_response_time_dict[receiver] = [response_time] + else: + shuffled_developer_rts_response_time_dict[receiver].append(response_time) + shuffled_response_times_recv_to_send.append(response_time) + break + + response_times_recv_to_send = np.array(response_times_recv_to_send) / 3600 + shuffled_response_times_recv_to_send = np.array(shuffled_response_times_recv_to_send) / 3600 + + print("Node was a receiver, how long until it sends?") + print(f"Mean actual response time: {np.mean(response_times_recv_to_send):.2f} hours") + print(f"Mean random response time: {np.mean(shuffled_response_times_recv_to_send):.2f} hours") + + # plt.title("Response Time: Receiver to Sender") + plt.hist(response_times_recv_to_send, alpha=0.5, density=True, label="Actual Response Time", color="blue") + plt.axvline(np.mean(response_times_recv_to_send), alpha=0.7, label = f"Mean Actual Response Time: {np.mean(response_times_recv_to_send):.2f} h", color="blue") + plt.hist(shuffled_response_times_recv_to_send, alpha=0.5, density=True, label="Shuffled Response Time", color="red") + plt.axvline(np.mean(shuffled_response_times_recv_to_send), alpha=0.7, label = f"Mean Shuffled Response Time: {np.mean(shuffled_response_times_recv_to_send):.2f} h", color="red") + plt.xlabel('hours') + plt.legend() + plt.show() + + response_times_send_to_recv = [] + developer_str_response_time_dict = dict() + + for i, sender in enumerate(tqdm(senders)): + for j, receiver in enumerate(receivers[i:]): + if receiver == sender: + response_time = timestamps[i+j] - timestamps[i] + if receiver not in developer_str_response_time_dict: + developer_str_response_time_dict[receiver] = [response_time] + else: + developer_str_response_time_dict[receiver].append(response_time) + response_times_send_to_recv.append(response_time) + break + + shuffled_response_times_send_to_recv = [] + shuffled_developer_str_response_time_dict = dict() + + for i, sender in enumerate(tqdm(shuffled_senders)): + for j, receiver in enumerate(shuffled_receivers[i:]): + if receiver == sender: + response_time = timestamps[i+j] - timestamps[i] + if receiver not in shuffled_developer_str_response_time_dict: + shuffled_developer_str_response_time_dict[receiver] = [response_time] + else: + shuffled_developer_str_response_time_dict[receiver].append(response_time) + shuffled_response_times_send_to_recv.append(response_time) + break + + response_times_send_to_recv = np.array(response_times_send_to_recv) / 3600 + shuffled_response_times_send_to_recv = np.array(shuffled_response_times_send_to_recv) / 3600 + + print("Node was a sender, how long until it receives?") + print(f"Mean actual response time: {np.mean(response_times_send_to_recv):.2f} hours") + print(f"Mean random response time: {np.mean(shuffled_response_times_send_to_recv):.2f} hours") + + # plt.title("Response Time: Sender to Receiver") + plt.hist(response_times_send_to_recv, alpha=0.5, density=True, label="Actual Response Time", color="blue") + plt.axvline(np.mean(response_times_send_to_recv), alpha=0.7, label = f"Mean Actual Response Time: {np.mean(response_times_send_to_recv):.2f} h", color="blue") + plt.hist(shuffled_response_times_send_to_recv, alpha=0.5, density=True, label="Shuffled Response Time", color="red") + plt.axvline(np.mean(shuffled_response_times_send_to_recv), alpha=0.7, label = f"Mean Shuffled Response Time: {np.mean(shuffled_response_times_send_to_recv):.2f} h", color="red") + plt.xlabel('hours') + plt.legend() + plt.show() \ No newline at end of file -- GitLab