Skip to content
Snippets Groups Projects
Commit 54f1e7d5 authored by Moritz Schüler's avatar Moritz Schüler
Browse files

Upload New File

parent ae42ce86
No related merge requests found
import numpy as np
import pathpyG as pp
import matplotlib.pyplot as plt
from scipy.stats import poisson, zipf, norm, lognorm, kstest, chisquare
from tqdm import tqdm
import random
def mine_interevent_times(graph: pp.TemporalGraph, time_aggregation_list: list = [1,1,1]) -> dict:
interevent_time_dict = {}
edge_interevent_time_dict = {}
node_interevent_time_dict = {}
interevent_time_dict['global'] = global_interevent_time(graph, time_aggregation_list[0])
edge_intereventtime = edge_interevent_time(graph, time_aggregation_list[1])
edge_interevent_time_dict['aggregate'] = edge_intereventtime[1]
edge_interevent_time_dict['individual'] = edge_intereventtime[0]
interevent_time_dict['edge'] = edge_interevent_time_dict
node_intereventtime = node_interevent_time(graph, time_aggregation_list[2])
node_interevent_time_dict['aggregate'] = node_intereventtime[1]
node_interevent_time_dict['individual'] = node_intereventtime[0]
interevent_time_dict['node'] = node_interevent_time_dict
return interevent_time_dict
def ks_test(data, dist):
if dist == 'lognorm':
data = np.array(data)
data = data + 1e-9
shape, loc, scale = lognorm.fit(data, floc=0)
ks_stat, ks_p_value = kstest(data, 'lognorm', args=(shape, loc, scale))
print('--------------------')
print('KS test lognorm')
print(f"KS Statistic: {ks_stat}")
print(f"P-Value: {ks_p_value}")
elif dist == 'norm':
mu, sigma = norm.fit(data)
ks_stat, ks_p_value = kstest(data, 'norm', args=(mu, sigma))
print('--------------------')
print('KS test norm')
print(f"KS Statistic: {ks_stat}")
print(f"P-Value: {ks_p_value}")
elif dist == 'poisson':
lambda_est = np.mean(data)
poisson_dist = poisson(mu=lambda_est)
ks_stat, ks_p_value = kstest(data, poisson_dist.cdf)
print('--------------------')
print('KS test poisson')
print(f"KS Statistic: {ks_stat}")
print(f"P-Value: {ks_p_value}")
elif dist == 'zipf':
alpha_est = 2 # Example; use MLE or other methods to estimate
zipf_dist = zipf(a=alpha_est)
ks_stat, ks_p_value = kstest(data, zipf_dist.cdf)
print('--------------------')
print('KS test zipf')
print(f"KS Statistic: {ks_stat}")
print(f"P-Value: {ks_p_value}")
def chisquare_test(data, dist):
match dist:
case 'poisson':
unique, observed_frequencies = np.unique(data, return_counts=True)
lambda_poisson = np.mean(data)
expected_frequencies_poisson = len(data) * poisson.pmf(unique, lambda_poisson)
chi2_stat, chi2_p_value = chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies_poisson, sum_check=False, ddof=1)
print('--------------------')
print('chisquare test poisson')
print(f"chisquare Statistic: {chi2_stat}")
print(f"P-Value: {chi2_p_value}")
case 'zipf':
unique, observed_frequencies = np.unique(data, return_counts=True)
s_zipf = 1.1
expected_frequencies_zipf = len(data) * zipf.pmf(unique, s_zipf)
chi2_stat, chi2_p_value = chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies_zipf, sum_check=False, ddof=1)
print('--------------------')
print('chisquare test zipf')
print(f"chisquare Statistic: {chi2_stat}")
print(f"P-Value: {chi2_p_value}")
case 'norm':
unique, observed_frequencies = np.unique(data, return_counts=True)
mean_normal = np.mean(data)
std_normal = np.std(data, ddof=1)
expected_frequencies_normal = len(data) * norm.pdf(unique, mean_normal, std_normal)
chi2_stat, chi2_p_value = chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies_normal, sum_check=False, ddof=2)
print('--------------------')
print('chisquare test normal')
print(f"chisquare Statistic: {chi2_stat}")
print(f"P-Value: {chi2_p_value}")
case 'lognorm':
unique, observed_frequencies = np.unique(data, return_counts=True)
shape_lognormal, loc_lognormal, scale_lognormal = lognorm.fit(data, floc=0) # Fit Lognormal parameters
expected_frequencies_lognormal = len(data) * lognorm.pdf(unique, shape_lognormal, loc_lognormal, scale_lognormal)
chi2_stat, chi2_p_value = chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies_lognormal, sum_check=False, ddof=2)
print('--------------------')
print('chisquare test lognormal')
print(f"chisquare Statistic: {chi2_stat}")
print(f"P-Value: {chi2_p_value}")
def fourier(graph: pp.TemporalGraph, time_aggregation):
time_data = graph.data.time.tolist()
time_data = np.unique(time_data)
time_data = time_data/time_aggregation
time_data = np.array([int(t) for t in time_data])
hist, _ = np.histogram(time_data, np.arange(min(time_data), max(time_data) + 1))
fft_result = np.fft.fft(hist)
frequencies = np.fft.fftfreq(len(hist), d=1)
plt.plot(frequencies[:len(frequencies)//2], np.abs(fft_result)[:len(frequencies)//2])
plt.axvline(time_aggregation/(60*60*24), label = 'daily pattern', color = 'r', alpha = 0.4)
plt.axvline(time_aggregation/(60*60*24*7), label = 'weekly pattern', color = 'g', alpha = 0.4)
plt.xlabel(f'Frequency in 1/{time_aggregation}s')
plt.ylabel('Amplitude')
# plt.title('Frequency Spectrum')
plt.legend()
plt.show()
def plot_event_time_histogram(graph: pp.TemporalGraph):
time_aggregation = 60*60*24
time_data = graph.data.time.tolist()
time_data = np.unique(time_data)
time_data = time_data/time_aggregation
time_data = np.array([int(t) for t in time_data])
values, counts = np.unique(time_data, return_counts=True)
plt.figure(figsize=(25,6))
plt.bar(values, counts, 1)
time_range = int((time_data[-1] - time_data[0])/365)
plt.xticks([time_data[0] + i*365 for i in range(time_range+1)], range(time_range+1))
plt.xlabel('years since start')
plt.ylabel('counts')
# plt.title('events aggregated to days since start of the project')
plt.show()
values, counts = np.unique(time_data, return_counts=True)
plt.figure(figsize=(25,6))
plt.bar(values, counts, 1)
time_range = int((time_data[-1] - time_data[0]))
plt.xticks([time_data[0] + i for i in range(time_range+1) if i%7 ==0], range(0, time_range+1, 7), rotation = 90)
plt.xlim(min(values), min(values) + 365)
plt.xlabel('days since start')
plt.ylabel('counts')
# plt.title('events aggregated to days since start of the project')
plt.show()
def plot_interevent_time_histograms(global_iets, edge_iets, node_iets):
plt.figure(figsize=(25,10))
values, counts = np.unique(global_iets, return_counts=True)
plt.bar(values, counts, 4)
# plt.title('global inter-event times')
plt.ylabel('count')
plt.xlabel('interevent time in hours')
plt.show()
plt.figure(figsize=(25,10))
values, counts = np.unique(edge_iets, return_counts=True)
plt.bar(values, counts, 2)
# plt.title('edge interevent times')
plt.ylabel('count')
plt.xlabel('interevent time in days')
plt.show()
plt.figure(figsize=(25,10))
values, counts = np.unique(node_iets, return_counts=True)
plt.bar(values, counts, 2)
# plt.title('node interevent times')
plt.ylabel('count')
plt.xlabel('interevent time in days')
plt.show()
def plot_distribution_comparison(interevent_times: list, level):
avg_interevent_time = np.mean(interevent_times)
std_interevent_time = np.std(interevent_times)
values, counts = np.unique(interevent_times, return_counts=True)
total_count = np.sum(counts)
k_values = np.arange(0, max(interevent_times))
x_values = np.linspace(0, max(interevent_times), 20000)
#poisson
poisson_pmf = poisson.pmf(k_values, mu=avg_interevent_time)
poisson_pmf = poisson_pmf * total_count
#zipf
zipf_param = 1.1
zipf_pmf = zipf.pmf(k_values, a=zipf_param)
zipf_pmf = zipf_pmf * total_count
#normal
normal_pdf = norm.pdf(x_values, loc=avg_interevent_time, scale=std_interevent_time)
normal_pdf = normal_pdf * total_count
#log-normal
interevent_times = np.array(interevent_times)
interevent_times = interevent_times + 1e-9
shape, loc, scale = lognorm.fit(interevent_times, floc = 0)
lognormal_pdf = lognorm.pdf(x_values, s=shape, loc=loc, scale=scale)
lognormal_pdf = lognormal_pdf * total_count
plt.figure(figsize=(12, 8))
plt.plot(x_values, lognormal_pdf, 'c-', label='Log-Normal PDF', alpha = 1)
plt.plot(x_values, normal_pdf, 'm-', label=f'Normal PDF (μ={avg_interevent_time:.2f}, σ={std_interevent_time:.2f})', alpha = 1)
plt.plot(k_values, zipf_pmf, 'g-', label=f'Zipf PMF (a={zipf_param:.2f})', alpha = 1)
plt.plot(k_values, poisson_pmf, 'r-', label=f'Poisson PMF (λ={avg_interevent_time:.2f})', alpha = 1)
# plt.hist(interevent_times, density=True, color='blue', alpha=0.6, label='Interevent Times')
if level == 'global':
plt.xlabel('interevent time in hours')
else:
plt.xlabel('interevent time in days')
plt.ylabel('count')
plt.bar(values, counts, 2, label='Interevent Times', alpha = 0.5)
# plt.title(f'Comparison of Interevent Time Distribution with various theoretical distributions')
plt.legend()
plt.xlim(0, 50)
plt.ylim(0, max(counts))
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
def distribution_comparison_and_hypothesis_testing(interevent_time_dict, interevent_time_dict_raw):
plot_distribution_comparison(interevent_time_dict['global'], 'global')
chisquare_test(interevent_time_dict_raw['global'], 'norm')
chisquare_test(interevent_time_dict_raw['global'], 'lognorm')
chisquare_test(interevent_time_dict_raw['global'], 'poisson')
chisquare_test(interevent_time_dict_raw['global'], 'zipf')
ks_test(interevent_time_dict_raw['global'], 'norm')
ks_test(interevent_time_dict_raw['global'], 'lognorm')
plot_distribution_comparison(interevent_time_dict['edge']['aggregate'], 'edge')
chisquare_test(interevent_time_dict_raw['edge']['aggregate'], 'norm')
chisquare_test(interevent_time_dict_raw['edge']['aggregate'], 'lognorm')
chisquare_test(interevent_time_dict_raw['edge']['aggregate'], 'poisson')
chisquare_test(interevent_time_dict_raw['edge']['aggregate'], 'zipf')
ks_test(interevent_time_dict_raw['edge']['aggregate'], 'norm')
ks_test(interevent_time_dict_raw['edge']['aggregate'], 'lognorm')
plot_distribution_comparison(interevent_time_dict['node']['aggregate'], 'node')
chisquare_test(interevent_time_dict_raw['node']['aggregate'], 'norm')
chisquare_test(interevent_time_dict_raw['node']['aggregate'], 'lognorm')
chisquare_test(interevent_time_dict_raw['node']['aggregate'], 'poisson')
chisquare_test(interevent_time_dict_raw['node']['aggregate'], 'zipf')
ks_test(interevent_time_dict_raw['node']['aggregate'], 'norm')
ks_test(interevent_time_dict_raw['node']['aggregate'], 'lognorm')
def global_interevent_time(graph: pp.TemporalGraph, time_aggregation: int = 1, ignore_zero_it = True) -> list:
time_data = graph.data.time
interevent_times = time_data[1:] - time_data[:-1]
if ignore_zero_it:
interevent_times = interevent_times[interevent_times > 0]
interevent_times = interevent_times/time_aggregation
interevent_times = np.array([int(t) for t in interevent_times])
interevent_times = [t.item() for t in interevent_times]
return interevent_times
def edge_interevent_time(graph: pp.TemporalGraph, time_aggregation: int = 1) -> tuple[dict, list]:
sender_nodes = list(graph.data.edge_index[0])
sender_nodes = [node.item() for node in sender_nodes]
receiver_nodes = list(graph.data.edge_index[1])
receiver_nodes = [node.item() for node in receiver_nodes]
edges = list(zip(sender_nodes, receiver_nodes))
time_data = graph.data.time
time_data = list(time_data)
interevent_time_dict = {edge: [] for edge in set(edges)}
time_stamp_dict = {edge: [] for edge in set(edges)}
sum_edge_interevent_times = []
for i in range(len(time_data)):
edge_time_stamp = time_data[i].item()
time_stamp_dict[edges[i]].append(edge_time_stamp)
for key in time_stamp_dict.keys():
edge_time_stamps = time_stamp_dict[key]
if len(edge_time_stamps) > 1:
edge_interevent_times = (np.array(edge_time_stamps[1:]) - np.array(edge_time_stamps[:-1]))
edge_interevent_times = edge_interevent_times[edge_interevent_times > 0]
edge_interevent_times = edge_interevent_times/time_aggregation
edge_interevent_times = np.array([int(t) for t in edge_interevent_times])
edge_interevent_times = [t.item() for t in edge_interevent_times]
if len(edge_interevent_times) > 0:
interevent_time_dict[key] = edge_interevent_times
sum_edge_interevent_times += edge_interevent_times
return interevent_time_dict, sum_edge_interevent_times
def node_interevent_time(graph: pp.TemporalGraph, time_aggregation: int = 1) -> tuple[dict, list]:
sender_nodes = list(graph.data.edge_index[0])
sender_nodes = [node.item() for node in sender_nodes]
receiver_nodes = list(graph.data.edge_index[1])
receiver_nodes = [node.item() for node in receiver_nodes]
time_data = graph.data.time
time_data = list(time_data)
nodes = set(sender_nodes + receiver_nodes)
interevent_time_dict = {node: [] for node in nodes}
time_stamp_dict = {node: [] for node in nodes}
sum_node_interevent_times = []
for i in range(len(time_data)):
node_time_stamp = time_data[i].item()
time_stamp_dict[sender_nodes[i]].append(node_time_stamp)
time_stamp_dict[receiver_nodes[i]].append(node_time_stamp)
for key in time_stamp_dict.keys():
node_time_stamps = time_stamp_dict[key]
if len(node_time_stamps) > 1:
node_interevent_times = (np.array(node_time_stamps[1:]) - np.array(node_time_stamps[:-1]))
node_interevent_times = node_interevent_times[node_interevent_times > 0]
node_interevent_times = node_interevent_times/time_aggregation
node_interevent_times = np.array([int(t) for t in node_interevent_times])
node_interevent_times = [t.item() for t in node_interevent_times]
if len(node_interevent_times) > 0:
interevent_time_dict[key] = node_interevent_times
sum_node_interevent_times += node_interevent_times
return interevent_time_dict, sum_node_interevent_times
def response_time(graph: pp.TemporalGraph):
senders = list(graph.data.edge_index[0])
senders = [node.item() for node in senders]
receivers = list(graph.data.edge_index[1])
receivers = [node.item() for node in receivers]
timestamps = graph.data.time
response_times_recv_to_send = []
developer_rts_response_time_dict = dict()
for i, receiver in enumerate(tqdm(receivers)):
for j, sender in enumerate(senders[i:]):
if receiver == sender:
response_time = timestamps[i+j] - timestamps[i]
if receiver not in developer_rts_response_time_dict:
developer_rts_response_time_dict[receiver] = [response_time]
else:
developer_rts_response_time_dict[receiver].append(response_time)
response_times_recv_to_send.append(response_time)
break
edges = list(zip(senders, receivers))
random.shuffle(edges)
shuffled_senders, shuffled_receivers = list(zip(*edges))
shuffled_response_times_recv_to_send = []
shuffled_developer_rts_response_time_dict = dict()
for i, receiver in enumerate(tqdm(shuffled_receivers)):
for j, sender in enumerate(shuffled_senders[i:]):
if receiver == sender:
response_time = timestamps[i+j] - timestamps[i]
if receiver not in shuffled_developer_rts_response_time_dict:
shuffled_developer_rts_response_time_dict[receiver] = [response_time]
else:
shuffled_developer_rts_response_time_dict[receiver].append(response_time)
shuffled_response_times_recv_to_send.append(response_time)
break
response_times_recv_to_send = np.array(response_times_recv_to_send) / 3600
shuffled_response_times_recv_to_send = np.array(shuffled_response_times_recv_to_send) / 3600
print("Node was a receiver, how long until it sends?")
print(f"Mean actual response time: {np.mean(response_times_recv_to_send):.2f} hours")
print(f"Mean random response time: {np.mean(shuffled_response_times_recv_to_send):.2f} hours")
# plt.title("Response Time: Receiver to Sender")
plt.hist(response_times_recv_to_send, alpha=0.5, density=True, label="Actual Response Time", color="blue")
plt.axvline(np.mean(response_times_recv_to_send), alpha=0.7, label = f"Mean Actual Response Time: {np.mean(response_times_recv_to_send):.2f} h", color="blue")
plt.hist(shuffled_response_times_recv_to_send, alpha=0.5, density=True, label="Shuffled Response Time", color="red")
plt.axvline(np.mean(shuffled_response_times_recv_to_send), alpha=0.7, label = f"Mean Shuffled Response Time: {np.mean(shuffled_response_times_recv_to_send):.2f} h", color="red")
plt.xlabel('hours')
plt.legend()
plt.show()
response_times_send_to_recv = []
developer_str_response_time_dict = dict()
for i, sender in enumerate(tqdm(senders)):
for j, receiver in enumerate(receivers[i:]):
if receiver == sender:
response_time = timestamps[i+j] - timestamps[i]
if receiver not in developer_str_response_time_dict:
developer_str_response_time_dict[receiver] = [response_time]
else:
developer_str_response_time_dict[receiver].append(response_time)
response_times_send_to_recv.append(response_time)
break
shuffled_response_times_send_to_recv = []
shuffled_developer_str_response_time_dict = dict()
for i, sender in enumerate(tqdm(shuffled_senders)):
for j, receiver in enumerate(shuffled_receivers[i:]):
if receiver == sender:
response_time = timestamps[i+j] - timestamps[i]
if receiver not in shuffled_developer_str_response_time_dict:
shuffled_developer_str_response_time_dict[receiver] = [response_time]
else:
shuffled_developer_str_response_time_dict[receiver].append(response_time)
shuffled_response_times_send_to_recv.append(response_time)
break
response_times_send_to_recv = np.array(response_times_send_to_recv) / 3600
shuffled_response_times_send_to_recv = np.array(shuffled_response_times_send_to_recv) / 3600
print("Node was a sender, how long until it receives?")
print(f"Mean actual response time: {np.mean(response_times_send_to_recv):.2f} hours")
print(f"Mean random response time: {np.mean(shuffled_response_times_send_to_recv):.2f} hours")
# plt.title("Response Time: Sender to Receiver")
plt.hist(response_times_send_to_recv, alpha=0.5, density=True, label="Actual Response Time", color="blue")
plt.axvline(np.mean(response_times_send_to_recv), alpha=0.7, label = f"Mean Actual Response Time: {np.mean(response_times_send_to_recv):.2f} h", color="blue")
plt.hist(shuffled_response_times_send_to_recv, alpha=0.5, density=True, label="Shuffled Response Time", color="red")
plt.axvline(np.mean(shuffled_response_times_send_to_recv), alpha=0.7, label = f"Mean Shuffled Response Time: {np.mean(shuffled_response_times_send_to_recv):.2f} h", color="red")
plt.xlabel('hours')
plt.legend()
plt.show()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment