
Commit e1251d3

Author: Abdullah Al Sefat
Commit message: filtering
1 parent a884638

File tree

3 files changed: +245 -0 lines changed

result_filtering/http_merge_2.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
import json
import os
from urllib.parse import urlparse
import shutil

def normalize_url(url):
    """
    Normalize URL by removing protocol, trailing slashes, and standardizing www.
    """
    parsed = urlparse(url)
    # Remove 'www.' if present
    netloc = parsed.netloc.replace('www.', '')
    # Combine with path and remove trailing slashes
    return netloc + parsed.path.rstrip('/')

def normalize_link(link):
    """
    Normalize a link URL in the same way as the main site URL.
    """
    parsed = urlparse(link)
    netloc = parsed.netloc.replace('www.', '')
    # Preserve query parameters and fragments in links
    normalized = netloc + parsed.path.rstrip('/')
    if parsed.query:
        normalized += '?' + parsed.query
    if parsed.fragment:
        normalized += '#' + parsed.fragment
    return normalized

def merge_sites(sites):
    """Merge sites that have the same URL but different protocols or www prefix."""
    # Group sites by site name and normalized URL
    site_groups = {}
    for site in sites:
        norm_url = normalize_url(site['Site URL'])
        site_name = site['Site Name']
        key = (site_name, norm_url)
        if key not in site_groups:
            site_groups[key] = []
        site_groups[key].append(site)

    # Merge sites that need to be merged
    merged_sites = []
    for sites_group in site_groups.values():
        if len(sites_group) == 1:
            merged_sites.append(sites_group[0])
        else:
            # Prefer https over http for the main site URL
            https_site = next((site for site in sites_group if site['Site URL'].startswith('https')), None)
            base_site = https_site if https_site else sites_group[0]

            # Merge all links from all versions and normalize them
            all_links = set()
            for site in sites_group:
                # Normalize each link to handle www consistently
                normalized_links = [normalize_link(link) for link in site['Links']]
                all_links.update(normalized_links)

            # Create merged site entry
            merged_site = base_site.copy()
            merged_site['Links'] = sorted(list(all_links))
            merged_sites.append(merged_site)

    return merged_sites

def process_file(input_path, output_path):
    """Process a single JSON file."""
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Merge sites
    data['Sites'] = merge_sites(data['Sites'])

    # Write the processed data
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def main(input_dir, output_dir):
    """Process all JSON files in the input directory."""
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Process each JSON file
    for filename in os.listdir(input_dir):
        if filename.endswith('.json'):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)
            process_file(input_path, output_path)
            print(f"Processed {filename}")

if __name__ == "__main__":
    import sys

    input_dir = "output/robots_filtered"  # Replace with your input directory path
    output_dir = "output/http_merged"

    if not os.path.exists(input_dir):
        print(f"Error: Input directory '{input_dir}' does not exist")
        sys.exit(1)

    main(input_dir, output_dir)
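
For illustration, a minimal sketch of how merge_sites collapses http/https and www variants of the same site. The sample records are hypothetical (only the key names 'Site Name', 'Site URL', and 'Links' come from the script), and it assumes the file above is importable as http_merge_2:

from http_merge_2 import merge_sites  # assumes result_filtering/ is on the import path

# Hypothetical entries for the same site reached over http+www and over https.
sites = [
    {
        "Site Name": "Example",
        "Site URL": "http://www.example.com/",
        "Links": ["http://www.example.com/about"],
    },
    {
        "Site Name": "Example",
        "Site URL": "https://example.com",
        "Links": ["https://example.com/about", "https://example.com/contact"],
    },
]

merged = merge_sites(sites)
# Both entries normalize to the key ('Example', 'example.com'), so they collapse
# into a single record that keeps the https 'Site URL' and the union of the links.
print(len(merged))            # 1
print(merged[0]["Site URL"])  # https://example.com
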
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
import json
import os
from urllib.parse import urlparse, urlunparse

def normalize_url(url):
    """
    Normalize URL by removing the fragment (hashtag) portion.
    Returns the URL without the fragment.
    """
    parsed = urlparse(url)
    # Create new parsed URL without fragment
    clean_parsed = parsed._replace(fragment='')
    return urlunparse(clean_parsed)

def deduplicate_links(links, strip_all_fragments=True):
    """
    Deduplicate links by removing URLs that are the same except for hashtags.

    Args:
        links: List of URLs to process
        strip_all_fragments: If True, strips fragments from all URLs.
                             If False, only deduplicates based on normalized URLs.
    """
    # Create a dictionary to store unique normalized URLs
    unique_urls = {}

    # For each link, store only the first occurrence of its normalized version
    for link in links:
        normalized = normalize_url(link)
        if normalized not in unique_urls:
            # If we're stripping all fragments, store the normalized URL
            # Otherwise, store the original URL with fragment
            unique_urls[normalized] = normalized if strip_all_fragments else link

    # Return the deduplicated links in the same order they first appeared
    return list(unique_urls.values())

def process_file(input_path, output_path, strip_all_fragments=True):
    """Process a single JSON file to deduplicate links."""
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Process each site's links
    for site in data.get('Sites', []):
        if 'Links' in site:
            site['Links'] = deduplicate_links(site['Links'], strip_all_fragments)

    # Write the processed data
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def main():
    # Specify your input and output directories here
    input_dir = "output/http_merged"  # Replace with your input directory path
    output_dir = "output/deduplication"  # Replace with your output directory path

    # Set this to True to completely strip all fragments
    strip_all_fragments = True

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Process each JSON file
    for filename in os.listdir(input_dir):
        if filename.endswith('.json'):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)

            try:
                process_file(input_path, output_path, strip_all_fragments)
                print(f"Processed {filename}")
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

if __name__ == "__main__":
    main()
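
As a quick illustration of the deduplicate_links above, assuming the function is in scope (the sample URLs are hypothetical):

# Hypothetical input: three URLs, two of which differ only in their fragment.
links = [
    "https://example.com/page#intro",
    "https://example.com/page#details",
    "https://example.com/other",
]

# With strip_all_fragments=True (the default), fragments are dropped entirely:
#   ['https://example.com/page', 'https://example.com/other']
print(deduplicate_links(links))

# With strip_all_fragments=False, the first-seen original URL is kept as-is:
#   ['https://example.com/page#intro', 'https://example.com/other']
print(deduplicate_links(links, strip_all_fragments=False))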

result_filtering/remove_hash.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
import json
import os
from urllib.parse import urlparse, urlunparse

def normalize_url(url):
    """
    Normalize URL by removing the fragment (hashtag) portion.
    Returns the URL without the fragment.
    """
    parsed = urlparse(url)
    # Create new parsed URL without fragment
    clean_parsed = parsed._replace(fragment='')
    return urlunparse(clean_parsed)

def deduplicate_links(links):
    """
    Deduplicate links by removing URLs that are the same except for hashtags.
    """
    # Create a dictionary to store unique normalized URLs
    unique_urls = {}

    # For each link, store only the first occurrence of its normalized version
    for link in links:
        normalized = normalize_url(link)
        if normalized not in unique_urls:
            unique_urls[normalized] = link

    # Return the deduplicated links in the same order they first appeared
    return list(unique_urls.values())

def process_file(input_path, output_path):
    """Process a single JSON file to deduplicate links."""
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Process each site's links
    for site in data.get('Sites', []):
        if 'Links' in site:
            site['Links'] = deduplicate_links(site['Links'])

    # Write the processed data
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def main():
    # Specify your input and output directories here
    input_dir = "output/http_merged"  # Replace with your input directory path
    output_dir = "output/deduplication"  # Replace with your output directory path

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Process each JSON file
    for filename in os.listdir(input_dir):
        if filename.endswith('.json'):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)

            try:
                process_file(input_path, output_path)
                print(f"Processed {filename}")
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

if __name__ == "__main__":
    main()
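
A minimal sketch of this variant's behavior, assuming the script is importable as remove_hash; unlike the version above, it always keeps the first-seen original URL, fragment included (the sample URLs are hypothetical):

from remove_hash import deduplicate_links  # assumes result_filtering/ is on the import path

# Hypothetical input: two URLs that differ only in their fragment.
links = [
    "https://example.com/page#intro",
    "https://example.com/page#details",
]

# Only the first occurrence survives, and its fragment is preserved:
#   ['https://example.com/page#intro']
print(deduplicate_links(links))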
