
Commit f25366b: init code
1 parent 7b781a5

12 files changed: +11236 -1 lines

.gitignore (+4)

```diff
@@ -169,3 +169,7 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+
+fasttext_scorers/
+configs/
+crawl_results/
```

README.md (+95, -1)

# Crawl4LLM

This repo contains the code for the paper "Crawl4LLM: Efficient Web Crawling for LLM Pretraining".

## Prerequisites

1. [Request the ClueWeb22 dataset](https://lemurproject.org/clueweb22/).
2. Create a virtual environment with Python >= 3.10 and install the following requirements:

   ```
   numpy
   tqdm
   fasttext
   pyyaml
   wandb
   ```

3. [Download the DCLM fastText classifier](https://huggingface.co/mlfoundations/fasttext-oh-eli5/tree/main) to `fasttext_scorers/`.

## Run the Crawler

To run a (simulated) crawl, first create a YAML configuration file under `configs/`, then run the following command:

```bash
python crawl.py crawl --config <path_to_your_config_file>
```

### Crawl4LLM

Create a YAML file in `configs/` with the following content:

```yaml
cw22_root_path: <path_to_clueweb22_a>
seed_docs_file: seed.txt
output_dir: crawl_results/seed_10k_crawl_20m_dclm_fasttext
num_selected_docs_per_iter: 10000
num_workers: 16  # set to a number that fits your machine
save_state_every: -1  # set to a positive number to save the crawler state (queue & visited set) every that many steps
max_num_docs: 20000000
selection_method: dclm_fasttext_score
order: desc  # desc for descending, asc for ascending
wandb: true  # set to false to disable wandb logging
wandb_project: crawler
wandb_run_name: seed_10k_crawl_20m_dclm_fasttext
rating_methods:
  -
    type: length
  -
    type: fasttext_score
    rater_name: dclm_fasttext_score
    model_path: fasttext_scorers/openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train.bin
```

Documents are scored by all scorers listed in `rating_methods`. The configuration above sets two: a `length` scorer, which scores a document by its length, and a `fasttext_score` scorer, which scores a document with the DCLM fastText model. The final ranking is determined by `selection_method`, here set to `dclm_fasttext_score`, the name given to the `fasttext_score` scorer via `rater_name`.
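To make the selection mechanics concrete, here is an illustrative sketch of one iteration of a simulated crawl. `crawl.py` itself is not part of this commit, so the loop, the `rate` helper, and the seed-file format are assumptions pieced together from the config keys above; only `Document`, `DocumentAnnotation`, `ClueWeb22Api`, and `UnifiedGetter` are real names from `corpus_interface.py` shown later in this diff.

```python
# Illustrative sketch only: crawl.py is not in this commit, so this loop is
# an assumption based on the config keys. Each rater writes its score into
# doc.annotations; selection_method names the key the ranking reads back.
from corpus_interface import ClueWeb22Api, Document, DocumentAnnotation, UnifiedGetter

cw22 = UnifiedGetter(ClueWeb22Api("<path_to_clueweb22_a>"), docid_pos=5)
DocumentAnnotation.set_compare_method("dclm_fasttext_score", "desc")  # selection_method + order

def rate(doc: Document) -> None:
    doc.annotations["length"] = len(doc.text or "")  # "length" scorer
    doc.annotations["dclm_fasttext_score"] = 0.0     # placeholder for the fastText model score

with open("seed.txt") as f:  # seed_docs_file (format assumed: one docid per line)
    frontier = [line.strip() for line in f]
visited: set[str] = set()
crawled: list[str] = []

while frontier and len(crawled) < 20_000_000:  # max_num_docs
    docs = [d for docid in frontier if (d := cw22.get_doc(docid))]
    for doc in docs:
        rate(doc)
    docs.sort(key=lambda d: d.annotations)     # DocumentAnnotation.__lt__ ranks by the compare key
    selected = docs[:10_000]                   # num_selected_docs_per_iter
    visited.update(d.docid for d in selected)
    crawled.extend(d.docid for d in selected)
    frontier = [
        out
        for d in selected
        for out in cw22.get_outlinks(d.docid)
        if out not in visited
    ]
```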

### Baseline Crawlers

#### Random Crawler

```yaml
cw22_root_path: <path_to_clueweb22_a>
seed_docs_file: seed.txt
output_dir: crawl_results/seed_10k_crawl_20m_random
num_selected_docs_per_iter: 10000
num_workers: 16
save_state_every: -1
max_num_docs: 20000000
selection_method: random_score
order: desc
wandb: true
wandb_project: crawler
wandb_run_name: seed_10k_crawl_20m_random
rating_methods:
  -
    type: random_score
```

#### Indegree-based Crawler

```yaml
cw22_root_path: <path_to_clueweb22_a>
seed_docs_file: seed.txt
output_dir: crawl_results/seed_10k_crawl_20m_indegree
num_selected_docs_per_iter: 10000
num_workers: 16
save_state_every: -1
max_num_docs: 20000000
selection_method: inlink_count
order: desc
wandb: true
wandb_project: crawler
wandb_run_name: seed_10k_crawl_20m_indegree
rating_methods:
  -
    type: inlink_count
```
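The `inlink_count` scorer's implementation is not in this commit; conceptually it can be read as ranking a page by how many CW22-A in-links the corpus records for it, which `UnifiedGetter.get_inlinks` (in `corpus_interface.py` below) exposes directly. A minimal sketch under that assumption:

```python
# Sketch (assumption): an indegree score is just the number of CW22-A
# in-links the corpus records for the document.
from corpus_interface import ClueWeb22Api, UnifiedGetter

cw22 = UnifiedGetter(ClueWeb22Api("<path_to_clueweb22_a>"), docid_pos=5)

def inlink_count(docid: str) -> int:
    return len(cw22.get_inlinks(docid))  # get_inlinks() returns [] when the record is missing
```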

access_data.py (+29)

```python
import logging

from corpus_interface import ClueWeb22Api, UnifiedGetter

logger = logging.getLogger(__name__)


def main():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    # Path to ClueWeb22_A on Boston; docid_pos=5 is the index of the
    # ClueWeb22-ID field in each outlink/anchor record.
    cw22 = UnifiedGetter(ClueWeb22Api("/bos/tmp6/ClueWeb22_A"), docid_pos=5)
    # Fetch one document's clean text, then walk its outlinks.
    doc_content = cw22.get_doc("clueweb22-en0045-44-19547")
    print(doc_content)
    outlinks = cw22.get_outlinks("clueweb22-en0045-44-19547")
    print(outlinks)
    for outlink in outlinks:
        if doc := cw22.get_doc(outlink):
            print(f"outlink doc {outlink} found")
            print(doc)
        else:
            print(f"outlink doc {outlink} not found")


if __name__ == "__main__":
    main()
```

corpus_interface.py (+132)

```python
import gzip
import json
import logging
import os
from dataclasses import dataclass, field
from typing import Literal

logger = logging.getLogger(__name__)


class DocumentAnnotation(dict):
    _compare_key: str | None = None
    _order: str = "desc"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @classmethod
    def set_compare_method(cls, key: str, order: Literal["desc", "asc"]) -> None:
        cls._compare_key = key
        if order not in ["desc", "asc"]:
            raise ValueError("Order must be either 'desc' (descending) or 'asc' (ascending)")
        cls._order = order

    @classmethod
    def get_compare_key(cls) -> str | None:
        return cls._compare_key

    def __lt__(self, other) -> bool:
        # With "desc", the comparison is inverted so that ascending-order
        # containers (sorted, heapq) surface the highest score first.
        if self._compare_key is None:
            raise ValueError("Compare key not set")
        if self._order == "desc":
            return self[self._compare_key] > other[self._compare_key]
        return self[self._compare_key] < other[self._compare_key]


@dataclass
class Document:
    docid: str
    text: str | None = None
    annotations: DocumentAnnotation = field(default_factory=DocumentAnnotation)


class ClueWeb22Api:
    # Modified from https://github.com/lemurproject/ClueWeb22/blob/main/ClueWeb22Api.py
    def __init__(self, cw22root_path) -> None:
        self.cw22root_path = cw22root_path

    def get_base_filename_by_id(self, cw22id: str, file_type: str = "html") -> str:
        # A docid like "clueweb22-en0045-44-19547" maps to
        # <root>/<file_type>/en/en00/en0045/en0045-44.
        html_path = self.cw22root_path + os.sep + file_type
        id_parts = cw22id.split("-")

        language = id_parts[1][:2]
        segment = id_parts[1][:4]
        directory = id_parts[1]
        base_path = html_path + os.sep + language + os.sep + segment + os.sep + directory + os.sep
        base_filename = base_path + id_parts[1] + "-" + id_parts[2]
        return base_filename

    def get_json_record(self, cw22id: str, record_type: str) -> str:
        base_filename = self.get_base_filename_by_id(cw22id, file_type=record_type)

        id_parts = cw22id.split("-")
        doc = int(id_parts[-1])

        # Each entry in the .offset file is a fixed-width, zero-padded
        # 10-digit byte offset plus a newline; entries i and i+1 bracket
        # record i inside the gzipped .json.gz member.
        offset_length = len("{:010d}\n".format(0))
        offset_path = base_filename + ".offset"
        json_path = base_filename + ".json.gz"
        with open(json_path, "rb") as f_json:
            with open(offset_path, "r") as f_offset:
                f_offset.seek(doc * offset_length)
                start_bytes = int(f_offset.read(offset_length).strip())
                end_bytes = int(f_offset.read(offset_length).strip())
                f_json.seek(start_bytes)
                record = f_json.read(end_bytes - start_bytes)
                record = gzip.decompress(record).decode("utf-8")
        return record

    def get_clean_text(self, cw22id: str) -> str:
        record = self.get_json_record(cw22id, "txt")
        return record

    def get_inlinks(self, cw22id: str) -> str:
        record = self.get_json_record(cw22id, "inlink")
        return record

    def get_outlinks(self, cw22id: str) -> str:
        record = self.get_json_record(cw22id, "outlink")
        return record


class UnifiedGetter:
    def __init__(self, cw22_api: ClueWeb22Api, docid_pos: int = 0) -> None:
        self.cw22_api = cw22_api
        self.docid_pos = docid_pos

    def get_doc(self, docid: str) -> Document | None:
        try:
            cw22_data = json.loads(self.cw22_api.get_clean_text(docid))
        except Exception:
            logger.debug(f"Failed to get doc: {docid}")  # Too many documents not found
            return None
        assert cw22_data["ClueWeb22-ID"] == docid
        return Document(docid=docid, text=cw22_data["Clean-Text"])

    def get_outlinks(self, docid: str) -> list[str]:
        try:
            obj = json.loads(self.cw22_api.get_outlinks(docid))
        except Exception:  # File not found or empty entry
            logger.info(f"Failed to get outlinks for doc: {docid}")
            return []
        assert obj["ClueWeb22-ID"] == docid
        return [
            x[self.docid_pos]
            for x in obj["outlinks"]
            if x[self.docid_pos] is not None
            and x[self.docid_pos].startswith("clueweb22-en0")  # Only keep CW22-A outlinks
        ]

    def get_inlinks(self, docid: str) -> list[str]:
        try:
            obj = json.loads(self.cw22_api.get_inlinks(docid))
        except Exception:
            logger.debug(f"Failed to get inlinks for doc: {docid}")
            return []
        assert obj["ClueWeb22-ID"] == docid
        return [
            x[self.docid_pos]
            for x in obj["anchors"]
            if x[self.docid_pos] is not None
            and x[self.docid_pos].startswith("clueweb22-en0")  # Only keep CW22-A inlinks
        ]
```
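Since the config's `save_state_every` comment mentions a queue, the `__lt__` above is presumably what lets annotated documents sit directly in an ordered container. A small sketch of assumed usage, not code from this commit: with the compare key set to the configured `selection_method`, a standard `heapq` pops the best-scored entry first.

```python
import heapq

from corpus_interface import DocumentAnnotation

# With order="desc", __lt__ inverts the comparison, so Python's min-heap
# surfaces the highest score first.
DocumentAnnotation.set_compare_method("dclm_fasttext_score", "desc")

heap: list[tuple[DocumentAnnotation, str]] = []
for docid, score in [("a", 0.2), ("b", 0.9), ("c", 0.5)]:
    heapq.heappush(heap, (DocumentAnnotation({"dclm_fasttext_score": score}), docid))

while heap:
    ann, docid = heapq.heappop(heap)
    print(docid, ann["dclm_fasttext_score"])  # b 0.9, then c 0.5, then a 0.2
```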
