
Commit f25366b: init code
1 parent 7b781a5

12 files changed: +11236 -1 lines

.gitignore (+4)

```diff
@@ -169,3 +169,7 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+
+fasttext_scorers/
+configs/
+crawl_results/
```

README.md (+95, -1)

# Crawl4LLM

This repo contains the code for the paper "Crawl4LLM: Efficient Web Crawling for LLM Pretraining".

## Prerequisites

1. [Request the ClueWeb22 dataset](https://lemurproject.org/clueweb22/).
2. Create a virtual environment with Python >= 3.10 and install the following requirements:

   ```
   numpy
   tqdm
   fasttext
   pyyaml
   wandb
   ```

3. [Download the DCLM fastText classifier](https://huggingface.co/mlfoundations/fasttext-oh-eli5/tree/main) to `fasttext_scorers/`.

## Run the Crawler

To run a (simulated) crawl, first create a YAML configuration file under `configs/`, then run the following command:

```bash
python crawl.py crawl --config <path_to_your_config_file>
```

### Crawl4LLM

Create a YAML file in `configs/` with the following content:

```yaml
cw22_root_path: <path_to_clueweb22_a>
seed_docs_file: seed.txt
output_dir: crawl_results/seed_10k_crawl_20m_dclm_fasttext
num_selected_docs_per_iter: 10000
num_workers: 16  # set to a number that fits your machine
save_state_every: -1  # set to a positive number to save the crawler state (queue & visited set) every that many steps
max_num_docs: 20000000
selection_method: dclm_fasttext_score
order: desc  # desc for descending, asc for ascending
wandb: true  # set to false to disable wandb logging
wandb_project: crawler
wandb_run_name: seed_10k_crawl_20m_dclm_fasttext
rating_methods:
  -
    type: length
  -
    type: fasttext_score
    rater_name: dclm_fasttext_score
    model_path: fasttext_scorers/openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train.bin
```

Documents are scored by all scorers listed in `rating_methods`. The configuration above sets two: a `length` scorer, which scores a document by its length, and a `fasttext_score` scorer, which scores a document with the DCLM fastText model. The final ranking is determined by `selection_method`, here set to `dclm_fasttext_score`, the name given to the `fasttext_score` scorer via `rater_name`.
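To make the selection mechanics concrete, here is an illustrative sketch of one iteration of a simulated crawl. `crawl.py` itself is not part of this commit, so the loop, the `rate` helper, and the seed-file format are assumptions pieced together from the config keys above; only `Document`, `DocumentAnnotation`, `ClueWeb22Api`, and `UnifiedGetter` are real names from `corpus_interface.py` shown later in this diff.

```python
# Illustrative sketch only: crawl.py is not in this commit, so this loop is
# an assumption based on the config keys. Each rater writes its score into
# doc.annotations; selection_method names the key the ranking reads back.
from corpus_interface import ClueWeb22Api, Document, DocumentAnnotation, UnifiedGetter

cw22 = UnifiedGetter(ClueWeb22Api("<path_to_clueweb22_a>"), docid_pos=5)
DocumentAnnotation.set_compare_method("dclm_fasttext_score", "desc")  # selection_method + order

def rate(doc: Document) -> None:
    doc.annotations["length"] = len(doc.text or "")  # "length" scorer
    doc.annotations["dclm_fasttext_score"] = 0.0     # placeholder for the fastText model score

with open("seed.txt") as f:  # seed_docs_file (format assumed: one docid per line)
    frontier = [line.strip() for line in f]
visited: set[str] = set()
crawled: list[str] = []

while frontier and len(crawled) < 20_000_000:  # max_num_docs
    docs = [d for docid in frontier if (d := cw22.get_doc(docid))]
    for doc in docs:
        rate(doc)
    docs.sort(key=lambda d: d.annotations)     # DocumentAnnotation.__lt__ ranks by the compare key
    selected = docs[:10_000]                   # num_selected_docs_per_iter
    visited.update(d.docid for d in selected)
    crawled.extend(d.docid for d in selected)
    frontier = [
        out
        for d in selected
        for out in cw22.get_outlinks(d.docid)
        if out not in visited
    ]
```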

### Baseline Crawlers

#### Random Crawler

```yaml
cw22_root_path: <path_to_clueweb22_a>
seed_docs_file: seed.txt
output_dir: crawl_results/seed_10k_crawl_20m_random
num_selected_docs_per_iter: 10000
num_workers: 16
save_state_every: -1
max_num_docs: 20000000
selection_method: random_score
order: desc
wandb: true
wandb_project: crawler
wandb_run_name: seed_10k_crawl_20m_random
rating_methods:
  -
    type: random_score
```

#### Indegree-based Crawler

```yaml
cw22_root_path: <path_to_clueweb22_a>
seed_docs_file: seed.txt
output_dir: crawl_results/seed_10k_crawl_20m_indegree
num_selected_docs_per_iter: 10000
num_workers: 16
save_state_every: -1
max_num_docs: 20000000
selection_method: inlink_count
order: desc
wandb: true
wandb_project: crawler
wandb_run_name: seed_10k_crawl_20m_indegree
rating_methods:
  -
    type: inlink_count
```
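The `inlink_count` scorer's implementation is not in this commit; conceptually it can be read as ranking a page by how many CW22-A in-links the corpus records for it, which `UnifiedGetter.get_inlinks` (in `corpus_interface.py` below) exposes directly. A minimal sketch under that assumption:

```python
# Sketch (assumption): an indegree score is just the number of CW22-A
# in-links the corpus records for the document.
from corpus_interface import ClueWeb22Api, UnifiedGetter

cw22 = UnifiedGetter(ClueWeb22Api("<path_to_clueweb22_a>"), docid_pos=5)

def inlink_count(docid: str) -> int:
    return len(cw22.get_inlinks(docid))  # get_inlinks() returns [] when the record is missing
```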

access_data.py (+29)

```python
import logging

from corpus_interface import ClueWeb22Api, UnifiedGetter

logger = logging.getLogger(__name__)


def main():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    # Path to ClueWeb22_A on Boston; docid_pos=5 is the index of the
    # ClueWeb22-ID field in each outlink/anchor record.
    cw22 = UnifiedGetter(ClueWeb22Api("/bos/tmp6/ClueWeb22_A"), docid_pos=5)
    # Fetch one document's clean text, then walk its outlinks.
    doc_content = cw22.get_doc("clueweb22-en0045-44-19547")
    print(doc_content)
    outlinks = cw22.get_outlinks("clueweb22-en0045-44-19547")
    print(outlinks)
    for outlink in outlinks:
        if doc := cw22.get_doc(outlink):
            print(f"outlink doc {outlink} found")
            print(doc)
        else:
            print(f"outlink doc {outlink} not found")


if __name__ == "__main__":
    main()
```

corpus_interface.py (+132)

```python
import gzip
import json
import logging
import os
from dataclasses import dataclass, field
from typing import Literal

logger = logging.getLogger(__name__)


class DocumentAnnotation(dict):
    _compare_key: str | None = None
    _order: str = "desc"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @classmethod
    def set_compare_method(cls, key: str, order: Literal["desc", "asc"]) -> None:
        cls._compare_key = key
        if order not in ["desc", "asc"]:
            raise ValueError("Order must be either 'desc' (descending) or 'asc' (ascending)")
        cls._order = order

    @classmethod
    def get_compare_key(cls) -> str | None:
        return cls._compare_key

    def __lt__(self, other) -> bool:
        # With "desc", the comparison is inverted so that ascending-order
        # containers (sorted, heapq) surface the highest score first.
        if self._compare_key is None:
            raise ValueError("Compare key not set")
        if self._order == "desc":
            return self[self._compare_key] > other[self._compare_key]
        return self[self._compare_key] < other[self._compare_key]


@dataclass
class Document:
    docid: str
    text: str | None = None
    annotations: DocumentAnnotation = field(default_factory=DocumentAnnotation)


class ClueWeb22Api:
    # Modified from https://github.com/lemurproject/ClueWeb22/blob/main/ClueWeb22Api.py
    def __init__(self, cw22root_path) -> None:
        self.cw22root_path = cw22root_path

    def get_base_filename_by_id(self, cw22id: str, file_type: str = "html") -> str:
        # A docid like "clueweb22-en0045-44-19547" maps to
        # <root>/<file_type>/en/en00/en0045/en0045-44.
        html_path = self.cw22root_path + os.sep + file_type
        id_parts = cw22id.split("-")

        language = id_parts[1][:2]
        segment = id_parts[1][:4]
        directory = id_parts[1]
        base_path = html_path + os.sep + language + os.sep + segment + os.sep + directory + os.sep
        base_filename = base_path + id_parts[1] + "-" + id_parts[2]
        return base_filename

    def get_json_record(self, cw22id: str, record_type: str) -> str:
        base_filename = self.get_base_filename_by_id(cw22id, file_type=record_type)

        id_parts = cw22id.split("-")
        doc = int(id_parts[-1])

        # Each entry in the .offset file is a fixed-width, zero-padded
        # 10-digit byte offset plus a newline; entries i and i+1 bracket
        # record i inside the gzipped .json.gz member.
        offset_length = len("{:010d}\n".format(0))
        offset_path = base_filename + ".offset"
        json_path = base_filename + ".json.gz"
        with open(json_path, "rb") as f_json:
            with open(offset_path, "r") as f_offset:
                f_offset.seek(doc * offset_length)
                start_bytes = int(f_offset.read(offset_length).strip())
                end_bytes = int(f_offset.read(offset_length).strip())
                f_json.seek(start_bytes)
                record = f_json.read(end_bytes - start_bytes)
                record = gzip.decompress(record).decode("utf-8")
        return record

    def get_clean_text(self, cw22id: str) -> str:
        record = self.get_json_record(cw22id, "txt")
        return record

    def get_inlinks(self, cw22id: str) -> str:
        record = self.get_json_record(cw22id, "inlink")
        return record

    def get_outlinks(self, cw22id: str) -> str:
        record = self.get_json_record(cw22id, "outlink")
        return record


class UnifiedGetter:
    def __init__(self, cw22_api: ClueWeb22Api, docid_pos: int = 0) -> None:
        self.cw22_api = cw22_api
        self.docid_pos = docid_pos

    def get_doc(self, docid: str) -> Document | None:
        try:
            cw22_data = json.loads(self.cw22_api.get_clean_text(docid))
        except Exception:
            logger.debug(f"Failed to get doc: {docid}")  # Too many documents not found
            return None
        assert cw22_data["ClueWeb22-ID"] == docid
        return Document(docid=docid, text=cw22_data["Clean-Text"])

    def get_outlinks(self, docid: str) -> list[str]:
        try:
            obj = json.loads(self.cw22_api.get_outlinks(docid))
        except Exception:  # File not found or empty entry
            logger.info(f"Failed to get outlinks for doc: {docid}")
            return []
        assert obj["ClueWeb22-ID"] == docid
        return [
            x[self.docid_pos]
            for x in obj["outlinks"]
            if x[self.docid_pos] is not None
            and x[self.docid_pos].startswith("clueweb22-en0")  # Only keep CW22-A outlinks
        ]

    def get_inlinks(self, docid: str) -> list[str]:
        try:
            obj = json.loads(self.cw22_api.get_inlinks(docid))
        except Exception:
            logger.debug(f"Failed to get inlinks for doc: {docid}")
            return []
        assert obj["ClueWeb22-ID"] == docid
        return [
            x[self.docid_pos]
            for x in obj["anchors"]
            if x[self.docid_pos] is not None
            and x[self.docid_pos].startswith("clueweb22-en0")  # Only keep CW22-A inlinks
        ]
```
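Since the config's `save_state_every` comment mentions a queue, the `__lt__` above is presumably what lets annotated documents sit directly in an ordered container. A small sketch of assumed usage, not code from this commit: with the compare key set to the configured `selection_method`, a standard `heapq` pops the best-scored entry first.

```python
import heapq

from corpus_interface import DocumentAnnotation

# With order="desc", __lt__ inverts the comparison, so Python's min-heap
# surfaces the highest score first.
DocumentAnnotation.set_compare_method("dclm_fasttext_score", "desc")

heap: list[tuple[DocumentAnnotation, str]] = []
for docid, score in [("a", 0.2), ("b", 0.9), ("c", 0.5)]:
    heapq.heappush(heap, (DocumentAnnotation({"dclm_fasttext_score": score}), docid))

while heap:
    ann, docid = heapq.heappop(heap)
    print(docid, ann["dclm_fasttext_score"])  # b 0.9, then c 0.5, then a 0.2
```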
