import json
import os
from urllib.parse import urlparse

def normalize_url(url):
    """
    Normalize URL by removing protocol, trailing slashes, and standardizing www.
    """
    parsed = urlparse(url)
    # Remove 'www.' if present
    netloc = parsed.netloc.replace('www.', '')
    # Combine with path and remove trailing slashes
    return netloc + parsed.path.rstrip('/')
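
# Illustrative example (made-up URL, not taken from the data): with the rules above,
# 'https://www.example.com/blog/' and 'http://example.com/blog' both normalize to
# 'example.com/blog', so protocol and 'www.' variants of the same page end up
# with the same key.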

def normalize_link(link):
    """
    Normalize a link URL in the same way as the main site URL.
    """
    parsed = urlparse(link)
    netloc = parsed.netloc.replace('www.', '')
    # Preserve query parameters and fragments in links
    normalized = netloc + parsed.path.rstrip('/')
    if parsed.query:
        normalized += '?' + parsed.query
    if parsed.fragment:
        normalized += '#' + parsed.fragment
    return normalized
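
# Illustrative example (made-up URL): 'http://www.example.com/page/?q=1#top'
# normalizes to 'example.com/page?q=1#top'; the protocol and 'www.' are dropped
# while the query string and fragment are preserved.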

def merge_sites(sites):
    """Merge entries that share a site name and normalized URL but differ in protocol or 'www.' prefix."""
    # Group sites by site name and normalized URL
    site_groups = {}
    for site in sites:
        norm_url = normalize_url(site['Site URL'])
        site_name = site['Site Name']
        key = (site_name, norm_url)
        if key not in site_groups:
            site_groups[key] = []
        site_groups[key].append(site)

    # Merge sites that need to be merged
    merged_sites = []
    for sites_group in site_groups.values():
        if len(sites_group) == 1:
            merged_sites.append(sites_group[0])
        else:
            # Prefer https over http for the main site URL
            https_site = next((site for site in sites_group if site['Site URL'].startswith('https://')), None)
            base_site = https_site if https_site else sites_group[0]

            # Merge all links from all versions and normalize them
            all_links = set()
            for site in sites_group:
                # Normalize each link to handle protocol and www consistently
                normalized_links = [normalize_link(link) for link in site['Links']]
                all_links.update(normalized_links)

            # Create merged site entry
            merged_site = base_site.copy()
            merged_site['Links'] = sorted(all_links)
            merged_sites.append(merged_site)

    return merged_sites
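
# Sketch of the expected input shape (field values are made up for illustration):
#   [{'Site Name': 'Example', 'Site URL': 'http://example.com',  'Links': ['http://example.com/a']},
#    {'Site Name': 'Example', 'Site URL': 'https://example.com', 'Links': ['https://example.com/b']}]
# merge_sites() would collapse these two entries into one that keeps the https
# 'Site URL' and the union of both 'Links' lists, with each link normalized by
# normalize_link().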

def process_file(input_path, output_path):
    """Process a single JSON file."""
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Merge sites
    data['Sites'] = merge_sites(data['Sites'])

    # Write the processed data
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
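
# Each input file is assumed to be a JSON object of the form {"Sites": [...]},
# matching the 'Sites' key accessed above; any other top-level keys are written
# back out unchanged.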

def main(input_dir, output_dir):
    """Process all JSON files in the input directory."""
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Process each JSON file
    for filename in os.listdir(input_dir):
        if filename.endswith('.json'):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)
            process_file(input_path, output_path)
            print(f"Processed {filename}")

if __name__ == "__main__":
    import sys

    input_dir = "output/robots_filtered"  # Replace with your input directory path
    output_dir = "output/http_merged"

    if not os.path.exists(input_dir):
        print(f"Error: Input directory '{input_dir}' does not exist")
        sys.exit(1)

    main(input_dir, output_dir)