Skip to content
Snippets Groups Projects
Commit e509a90e authored by Leonie Schafferhans's avatar Leonie Schafferhans
Browse files

Upload New File

parent 4d3c7b32
No related branches found
No related tags found
No related merge requests found
import git2net
from collections import defaultdict, deque
import matplotlib
import pathpyG as pp
import matplotlib.pyplot as plt
import datetime
def disambiguate_aliases(sqlite_db_file):
    '''
    Resolve duplicate author identities (aliases) in the database in place.

    Thin wrapper around git2net.disambiguate_aliases_db.
    '''
    git2net.disambiguate_aliases_db(sqlite_db_file)
def get_coediting_network(sqlite_db_file, author_identifier="author_name"):
    '''
    Fetch the co-editing network from the database via git2net.

    NOTE(review): `author_identifier` is currently ignored — the pass-through
    to git2net is commented out below; confirm whether it should be wired up.
    '''
    return git2net.get_coediting_network(sqlite_db_file)  # , author_identifier=author_identifier)
def build_weighted_event_graph(events, delta):
    """
    Build a weighted event graph linking temporally adjacent events.

    Two events (u, v, t) and (u_j, v_j, t_j) are connected when the later one
    starts at the node the earlier one ended on (v == u_j) and occurs within
    `delta` time units (t_j - t <= delta).
    After https://www.nature.com/articles/s41598-018-29577-2

    events: iterable of (source, target, timestamp) tuples.
    delta: maximum allowed inter-event time.
    Returns a defaultdict mapping each event to a list of
    ((source, target, timestamp), time_difference) successor entries.
    """
    events = sorted(events, key=lambda x: x[2])
    events_graph = defaultdict(list)
    for i, (u, v, t) in enumerate(events):
        for j in range(i + 1, len(events)):
            u_j, v_j, t_j = events[j]
            if t_j - t > delta:
                # Events are time-sorted, so no later event can qualify.
                break
            if v == u_j:
                edge = ((u_j, v_j, t_j), t_j - t)  # ((source, target, timestamp), time difference)
                # defaultdict creates the list on first access; the manual
                # `if key not in ...` initialization was redundant. The
                # membership test guards against duplicate input events.
                if edge not in events_graph[(u, v, t)]:
                    events_graph[(u, v, t)].append(edge)
    return events_graph  # (u,v,t): [((u,x,t), dt), ((u,w,t), dt), ...]
def threshold_event_graph(g, delta):
    '''
    Keep only the successor events whose inter-event time is within `delta`.

    Neighbor lists are scanned in order and the scan stops at the first
    weight above the threshold (lists produced by build_weighted_event_graph
    are ordered by increasing time difference).
    '''
    filtered = defaultdict(list)
    for event, weighted_successors in g.items():
        for successor, dt in weighted_successors:
            if dt > delta:
                break
            filtered[event].append(successor)
    return filtered
def find_strongly_ccs(edges):
    '''
    Find strongly connected components of a temporal edge list, scanning
    forward in time.

    Tarjan-style DFS where, from a node reached at time `time`, only edges
    (u, v, t) with u == node and t >= time are followed — i.e. paths must be
    non-decreasing in timestamp. Roots start at time 0.

    edges: iterable of (source, target, timestamp) tuples.
    Returns a list of components, each a list of nodes.

    NOTE(review): every DFS step rescans the full edge list, so this is
    O(V * E) overall — acceptable for small graphs only.
    '''
    index = 0          # next DFS discovery index
    stack = []         # Tarjan stack of nodes in the current DFS
    on_stack = set()   # fast membership test for `stack`
    disc = {}          # node -> discovery index
    low = {}           # node -> lowest discovery index reachable from node
    sccs = []
    def tarjan_scc(node, time):
        nonlocal index
        disc[node] = low[node] = index
        index += 1
        stack.append(node)
        on_stack.add(node)
        # Follow only edges leaving `node` at or after the arrival time.
        for u, v, t in edges:
            if u == node and t >= time:
                if v not in disc:
                    tarjan_scc(v, t)
                    low[node] = min(low[node], low[v])
                elif v in on_stack and disc[v] <= low[node]:
                    # NOTE(review): classic Tarjan applies the disc[v] update
                    # unconditionally when v is on the stack; the extra
                    # `disc[v] <= low[node]` guard here is redundant but
                    # harmless (min() makes it a no-op otherwise).
                    low[node] = min(low[node], disc[v])
        # `node` is the root of an SCC: pop the whole component off the stack.
        if low[node] == disc[node]:
            scc = []
            while stack:
                w = stack.pop()
                on_stack.remove(w)
                scc.append(w)
                if w == node:
                    break
            sccs.append(scc)
    # Collect every node appearing as a source or target.
    nodes = set()
    for u, v, t in edges:
        nodes.add(u)
        nodes.add(v)
    # Start a DFS (at time 0) from every node not yet discovered.
    for node in nodes:
        if node not in disc:
            tarjan_scc(node, 0)
    return sccs
def has_time_respecting_path(edges, u, v, start_time=0):
    """
    Return True iff `v` is reachable from `u` along a path whose edge
    timestamps never decrease, starting no earlier than `start_time`.

    Implemented as a BFS over an adjacency list; states are
    (node, arrival_time) pairs so a node may be revisited at distinct times.
    """
    successors = defaultdict(list)
    for source, target, timestamp in edges:
        successors[source].append((target, timestamp))

    seen = set()
    frontier = deque([(u, start_time)])
    while frontier:
        node, arrival = frontier.popleft()
        if node == v:
            return True
        if (node, arrival) in seen:
            continue
        seen.add((node, arrival))
        # Only edges departing at or after the current arrival time count.
        frontier.extend((nxt, ts) for nxt, ts in successors[node] if ts >= arrival)
    return False
def build_reachability_matrix(edges, nodes):
    """
    Build a boolean matrix of time-respecting reachability over `nodes`.

    matrix[u][v] is True iff there is a path u -> v whose edge timestamps
    never decrease, starting from time 0. The diagonal is always False
    (matching the original `u != v` guard).

    edges: iterable of (source, target, timestamp) tuples.
    nodes: collection of node identifiers indexing the matrix.

    Performance: one single-source temporal BFS per node (O(V * E)) instead
    of one BFS per ordered pair (O(V^2 * E)); output is identical.
    """
    adj = defaultdict(list)
    for src, tgt, ts in edges:
        adj[src].append((tgt, ts))

    matrix = {u: {v: False for v in nodes} for u in nodes}
    for u in nodes:
        # BFS over (node, arrival_time) states; marks everything reachable
        # from u in one pass.
        queue = deque([(u, 0)])
        visited = set()
        while queue:
            curr, curr_time = queue.popleft()
            if (curr, curr_time) in visited:
                continue
            visited.add((curr, curr_time))
            if curr != u and curr in matrix[u]:
                matrix[u][curr] = True
            for neighbor, ts in adj[curr]:
                if ts >= curr_time:
                    queue.append((neighbor, ts))
    return matrix
def merge_sccs(sccs, reachability_matrix):
    """
    Refine a list of SCCs by merging components whose members are pairwise
    mutually reachable according to `reachability_matrix`.

    Each later component j is compared against the *original* members of
    component i; merged components are deduplicated via set().
    Returns a new list of components (node order within each is unspecified).
    """
    consumed = [False] * len(sccs)
    result = []
    for i, base in enumerate(sccs):
        if consumed[i]:
            continue
        group = [i]
        for j in range(i + 1, len(sccs)):
            if consumed[j]:
                continue
            # Every pair (a in base, b in sccs[j]) must reach each other.
            mutually_reachable = all(
                reachability_matrix[a][b] and reachability_matrix[b][a]
                for a in base
                for b in sccs[j]
            )
            if mutually_reachable:
                group.append(j)
        combined = []
        for k in group:
            combined.extend(sccs[k])
            consumed[k] = True
        result.append(list(set(combined)))
    return result
def compute_max_LCC_percentage(components, no_nodes):
    """
    Return the size of the largest component relative to `no_nodes`.

    Returns 0 when `components` is empty or `no_nodes` is not positive.
    """
    if not components or no_nodes <= 0:
        return 0
    largest = max(len(component) for component in components)
    return largest / no_nodes
def process_delta(delta, event_graph, no_nodes, nodes):
    """
    Evaluate one delta value: threshold the event graph, extract temporal
    SCCs, iteratively merge them, and return the relative size of the
    largest component. Best-effort: returns 0 on any error.
    """
    try:
        filtered = threshold_event_graph(event_graph, delta)

        # Flatten the thresholded event graph back into a temporal edge list.
        edges = []
        for (src, tgt, ts), followers in filtered.items():
            edges.append((src, tgt, ts))
            edges.extend((tgt, f_tgt, f_ts) for _, f_tgt, f_ts in followers)

        reachability = build_reachability_matrix(tuple(edges), set(nodes))
        components = find_strongly_ccs(edges)

        # Merge repeatedly until a pass no longer reduces the component count.
        refined = components
        while True:
            refined = merge_sccs(refined, reachability)
            if len(refined) == len(components):
                break
            components = refined

        return compute_max_LCC_percentage(components=refined, no_nodes=no_nodes)
    except Exception as e:
        print('Error at process Delta')
        print(e)
        return 0
def analyze_dataset(sqlite_db_file, deltas, db_name, delta_unit='SEC'):
    """
    Run the full pipeline for one database and plot the relative largest
    SCC size as a function of delta.

    sqlite_db_file: path to a git2net SQLite database.
    deltas: delta values in seconds.
    db_name: used as the plot title.
    delta_unit: 'SEC', 'MIN' or 'DAY' — x-axis scaling only; unknown values
        fall back to seconds.
    """
    disambiguate_aliases(sqlite_db_file)
    network, _, _ = get_coediting_network(sqlite_db_file)

    # Deduplicate the temporal edges and sort them by timestamp.
    tedges = sorted(set(network.tedges), key=lambda e: e[2])
    graph = pp.TemporalGraph.from_edge_list(tedges)
    node_count = graph.n

    # A single event graph built at the largest delta serves every delta,
    # since thresholding happens inside process_delta.
    event_graph = build_weighted_event_graph(tedges, max(deltas))
    lcc_percentages = [
        process_delta(delta, event_graph, node_count, graph.nodes)
        for delta in deltas
    ]

    units = {'DAY': (60 * 60 * 24, 'days'), 'MIN': (60, 'minutes'), 'SEC': (1, 'seconds')}
    factor, label = units.get(delta_unit, (1, 'seconds'))
    scaled_deltas = [delta / factor for delta in deltas]

    plt.plot(scaled_deltas, lcc_percentages, label='LCC Percentage')
    plt.xlabel(f'Delta (in {label})')
    plt.ylabel('Relative largest SCC')
    plt.title(db_name)
    plt.grid(True)
    plt.show()
def analyze_database_rolling_time_window(sqlite_db_file, deltas, deltas_str, window_size, step_size, db_name):
    """
    Compute and plot LCC evolution over a rolling time window.

    For each window of `window_size` (advanced by `step_size`, in the same
    units as the edge timestamps), the relative largest-SCC size is computed
    per delta twice: against the nodes active in the window, and against all
    nodes of the full graph. Results are drawn as step plots in a 2x3 grid.

    sqlite_db_file: path to a git2net SQLite database.
    deltas: delta values; the fixed 2x3 subplot grid assumes at most
        6 entries — TODO confirm callers never pass more.
    deltas_str: human-readable labels matching `deltas`, used in legends/titles.
    window_size, step_size: rolling-window extent and stride.
    db_name: NOTE(review): accepted but never referenced in this body.
    """
    git2net.disambiguate_aliases_db(sqlite_db_file)
    x, _, _ = git2net.get_coediting_network(sqlite_db_file)  # , #author_identifier='author_name')
    # Deduplicate edges and sort by timestamp (edge[2]).
    unique_temporal_edges = sorted(set(x.tedges), key=lambda x: x[2])
    t = pp.TemporalGraph.from_edge_list(unique_temporal_edges)
    no_nodes = t.n
    starttime = t.start_time
    endtime = t.end_time
    current_time = starttime
    # delta -> list of per-window result records.
    results = {delta: [] for delta in deltas}
    while current_time + window_size <= endtime:
        window_endtime = current_time + window_size
        # Edges whose timestamp falls inside [current_time, window_endtime).
        window_edges = [edge for edge in unique_temporal_edges if current_time <= edge[2] < window_endtime]
        unique_nodes_window = set(node for edge in window_edges for node in edge[:2])
        num_unique_nodes_window = len(unique_nodes_window)
        if window_edges:
            for delta in deltas:
                event_graph = build_weighted_event_graph(window_edges, delta)
                # LCC relative to nodes active in this window...
                scc_percentage = process_delta(delta, event_graph, num_unique_nodes_window, t.nodes)
                # ...and relative to all nodes of the full graph.
                global_scc_percentage = process_delta(delta, event_graph, no_nodes, t.nodes)
                results[delta].append({
                    "start_time": datetime.datetime.fromtimestamp(current_time, tz=datetime.timezone.utc),
                    "end_time": datetime.datetime.fromtimestamp(window_endtime, tz=datetime.timezone.utc),
                    "scc_percentage": scc_percentage,
                    "scc_percentage_global": global_scc_percentage,
                    "num_unique_nodes_window": num_unique_nodes_window
                })
        current_time += step_size
    # One subplot per delta; grid shape fixed at 2 rows x 3 columns.
    fig, axes = plt.subplots(2, 3, figsize=(18, 10), sharex=True, sharey=True)
    colors = ['b', 'g', 'r', 'c', 'm', 'orange']
    for idx, delta in enumerate(deltas):
        x_values = [res['start_time'] for res in results[delta]]
        y_values = [res["scc_percentage"] for res in results[delta]]
        z_vals = [res['scc_percentage_global'] for res in results[delta]]
        row, col = divmod(idx, 3)
        ax = axes[row, col]
        color = colors[idx % len(colors)]
        # Solid line: window-relative LCC; dashed line: global-relative LCC.
        ax.step(x_values, y_values, where='post', linestyle='-', label=f"LCC % in Window Δ={deltas_str[idx]}",
                color=color)
        ax.step(x_values, z_vals, where='post', linestyle='--', label=f"LCC % Global Δ={deltas_str[idx]}", color=color,
                alpha=0.7)
        ax.set_title(f"LCC Evolution Δ={deltas_str[idx]}")
        ax.grid(alpha=.6)
        ax.legend()
        ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%d-%m-%Y'))
        ax.xaxis.set_major_locator(matplotlib.dates.AutoDateLocator())
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
    # Axis labels only on the bottom row / left column (axes share x and y).
    for ax in axes[-1]:
        ax.set_xlabel("Date")
    for ax in axes[:, 0]:
        ax.set_ylabel("LCC %")
    plt.tight_layout()
    plt.show()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment