|
| 1 | +import gzip |
| 2 | +import json |
| 3 | +import logging |
| 4 | +import os |
| 5 | +from dataclasses import dataclass, field |
| 6 | +from typing import Literal |
| 7 | + |
| 8 | +logger = logging.getLogger(__name__) |
| 9 | + |
| 10 | + |
| 11 | +class DocumentAnnotation(dict): |
| 12 | + _compare_key: str | None = None |
| 13 | + _order: str = "desc" |
| 14 | + |
| 15 | + def __init__(self, *args, **kwargs): |
| 16 | + super().__init__(*args, **kwargs) |
| 17 | + |
| 18 | + @classmethod |
| 19 | + def set_compare_method(cls, key: str, order: Literal["desc", "asc"]) -> None: |
| 20 | + cls._compare_key = key |
| 21 | + if order not in ["desc", "asc"]: |
| 22 | + raise ValueError("Order must be either 'desc' (descending) or 'asc' (ascending)") |
| 23 | + cls._order = order |
| 24 | + |
| 25 | + @classmethod |
| 26 | + def get_compare_key(cls) -> str | None: |
| 27 | + return cls._compare_key |
| 28 | + |
| 29 | + def __lt__(self, other) -> bool: |
| 30 | + if self._compare_key is None: |
| 31 | + raise ValueError("Compare key not set") |
| 32 | + if self._order == "desc": |
| 33 | + return self[self._compare_key] > other[self._compare_key] |
| 34 | + return self[self._compare_key] < other[self._compare_key] |
| 35 | + |
| 36 | + |
@dataclass
class Document:
    """A document identified by ``docid`` with optional text and annotations."""

    # Identifier of the document.
    docid: str
    # Document text; defaults to None when not supplied.
    text: str | None = None
    # Annotation mapping; each instance gets its own fresh DocumentAnnotation.
    annotations: DocumentAnnotation = field(default_factory=DocumentAnnotation)
| 42 | + |
| 43 | + |
class ClueWeb22Api:
    """Random-access reader for per-document JSON records under a ClueWeb22 root.

    Modified from https://github.com/lemurproject/ClueWeb22/blob/main/ClueWeb22Api.py
    """

    # Each entry in a ``.offset`` file is a zero-padded 10-digit byte position
    # followed by a newline, so entries have a fixed width and entry ``i``
    # starts at byte ``i * _OFFSET_ENTRY_LEN``.
    _OFFSET_ENTRY_LEN = len("{:010d}\n".format(0))

    def __init__(self, cw22root_path) -> None:
        """Store the root directory that contains the per-type subdirectories."""
        self.cw22root_path = cw22root_path

    def get_base_filename_by_id(self, cw22id: str, file_type: str = "html") -> str:
        """Return the extension-less path of the file holding ``cw22id``.

        The ID is split on ``-``; the second part names the directory (its
        first two and first four characters name the two parent levels) and
        the third part completes the file name.

        Args:
            cw22id: Dash-separated document ID.
            file_type: Top-level subdirectory of the root to look in.

        Raises:
            IndexError: If the ID has fewer than three dash-separated parts.
        """
        id_parts = cw22id.split("-")
        directory = id_parts[1]
        return os.path.join(
            self.cw22root_path,
            file_type,
            directory[:2],  # language
            directory[:4],  # segment
            directory,
            f"{directory}-{id_parts[2]}",
        )

    def get_json_record(self, cw22id: str, record_type: str) -> str:
        """Return the decompressed JSON text of one record.

        The last dash-separated part of the ID is the record's index in the
        ``.offset`` file; entries ``index`` and ``index + 1`` delimit the
        record's gzip-compressed bytes inside the ``.json.gz`` file.

        Raises:
            OSError: If the offset or data file cannot be opened, or the
                slice is not valid gzip data.
            ValueError: If the ID's last part or an offset entry is not an
                integer (including a missing entry past the end of the file).
        """
        base_filename = self.get_base_filename_by_id(cw22id, file_type=record_type)
        doc_index = int(cw22id.split("-")[-1])
        entry_len = self._OFFSET_ENTRY_LEN

        # Binary mode: the seek position is a computed byte offset, which is
        # only well-defined without newline translation or text decoding.
        with open(base_filename + ".offset", "rb") as f_offset:
            f_offset.seek(doc_index * entry_len)
            start_bytes = int(f_offset.read(entry_len).strip())
            end_bytes = int(f_offset.read(entry_len).strip())

        with open(base_filename + ".json.gz", "rb") as f_json:
            f_json.seek(start_bytes)
            compressed = f_json.read(end_bytes - start_bytes)

        return gzip.decompress(compressed).decode("utf-8")

    def get_clean_text(self, cw22id: str) -> str:
        """Return the raw JSON record of type ``"txt"`` for ``cw22id``."""
        return self.get_json_record(cw22id, "txt")

    def get_inlinks(self, cw22id: str) -> str:
        """Return the raw JSON record of type ``"inlink"`` for ``cw22id``."""
        return self.get_json_record(cw22id, "inlink")

    def get_outlinks(self, cw22id: str) -> str:
        """Return the raw JSON record of type ``"outlink"`` for ``cw22id``."""
        return self.get_json_record(cw22id, "outlink")
| 90 | + |
| 91 | + |
class UnifiedGetter:
    """Parse raw records from a ``ClueWeb22Api`` into documents and link lists.

    Lookup failures are logged and reported as ``None`` / an empty list
    rather than raised, so callers can iterate over many IDs without
    guarding each call.
    """

    # Only document IDs with this prefix are kept when listing links
    # (the CW22-A subset, per the original filter comments).
    _CW22A_PREFIX = "clueweb22-en0"

    def __init__(self, cw22_api: ClueWeb22Api, docid_pos: int = 0) -> None:
        """Store the API handle and the index of the doc ID inside a link entry.

        Args:
            cw22_api: Object providing ``get_clean_text``, ``get_outlinks``
                and ``get_inlinks``.
            docid_pos: Position of the linked document's ID within each
                entry of a record's link list.
        """
        self.cw22_api = cw22_api
        self.docid_pos = docid_pos

    def _cw22a_ids(self, entries) -> list[str]:
        """Extract the doc ID of each link entry, keeping only CW22-A IDs."""
        pos = self.docid_pos
        return [
            entry[pos]
            for entry in entries
            if entry[pos] is not None and entry[pos].startswith(self._CW22A_PREFIX)
        ]

    def get_doc(self, docid: str) -> Document | None:
        """Return the document's clean text wrapped in a ``Document``.

        Returns:
            The document, or None if the record could not be read or parsed.

        Raises:
            AssertionError: If the stored record carries a different ID.
        """
        try:
            cw22_data = json.loads(self.cw22_api.get_clean_text(docid))
        except Exception:
            # Best-effort lookup: any read/parse failure means "not found".
            # ``Exception`` (not a bare except) so Ctrl-C and SystemExit
            # still propagate.
            logger.debug("Failed to get doc: %s", docid)  # Too many documents not found
            return None
        assert cw22_data["ClueWeb22-ID"] == docid, f"Record ID mismatch for {docid}"
        return Document(docid=docid, text=cw22_data["Clean-Text"])

    def get_outlinks(self, docid: str) -> list[str]:
        """Return the CW22-A document IDs that ``docid`` links to.

        Returns:
            The filtered IDs, or an empty list if the record could not be
            read or parsed.

        Raises:
            AssertionError: If the stored record carries a different ID.
        """
        try:
            obj = json.loads(self.cw22_api.get_outlinks(docid))
        except Exception:  # File not found or empty entry
            logger.info("Failed to get outlinks for doc: %s", docid)
            return []
        assert obj["ClueWeb22-ID"] == docid, f"Record ID mismatch for {docid}"
        return self._cw22a_ids(obj["outlinks"])

    def get_inlinks(self, docid: str) -> list[str]:
        """Return the CW22-A document IDs that link to ``docid``.

        Returns:
            The filtered IDs, or an empty list if the record could not be
            read or parsed.

        Raises:
            AssertionError: If the stored record carries a different ID.
        """
        try:
            obj = json.loads(self.cw22_api.get_inlinks(docid))
        except Exception:
            logger.debug("Failed to get inlinks for doc: %s", docid)
            return []
        assert obj["ClueWeb22-ID"] == docid, f"Record ID mismatch for {docid}"
        return self._cw22a_ids(obj["anchors"])
0 commit comments