Skip to content

Commit 21cad2d

Browse files
committed
Policy with extra metadata, scope typing
1 parent 0ff3080 commit 21cad2d

File tree

1 file changed

+127
-63
lines changed

1 file changed

+127
-63
lines changed

src/policy/CMSRucioPolicy/algorithms/tape_colocation.py

Lines changed: 127 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,29 @@
22
Tape colocation algorithm for placement of CMS data on tape
33
'''
44

5+
import json
6+
import logging
57
from typing import Any, Optional, Union
8+
69
from rucio.common.exception import DataIdentifierNotFound
7-
from rucio.transfertool.fts3_plugins import FTS3TapeMetadataPlugin
8-
from rucio.core.did import list_parent_dids, get_did
9-
from rucio.db.sqla.constants import DIDType
1010
from rucio.common.types import InternalScope
11-
import logging
12-
import json
11+
from rucio.core.did import get_did, list_content, list_parent_dids
12+
from rucio.db.sqla.constants import DIDType
13+
from rucio.transfertool.fts3_plugins import FTS3TapeMetadataPlugin
1314

1415
logger = logging.getLogger(__name__)
1516

16-
class CMSTapeColocation(FTS3TapeMetadataPlugin):
17-
policy_algorithm = "tape_colocation"
17+
class CMSTapeColocation(FTS3TapeMetadataPlugin):
18+
policy_algorithm = "tape_metadata"
1819

20+
# Logic for tape colocation
1921
allowed_types = ['data', 'hidata', 'mc', 'himc', 'relval', 'hirelval']
2022
parking_name = "parking"
2123
raw_name = "raw"
2224
hiraw_name = "hiraw"
25+
26+
# Schema version as of June 26, 2025
27+
schema_version = 1
2328

2429
def __init__(self, policy: Union[str, None] = None) -> None:
2530

@@ -32,8 +37,8 @@ def _module_init_(cls) -> None:
3237
logger.info("Registered plugin %s", cls.policy_algorithm)
3338
cls.register(
3439
cls.policy_algorithm,
35-
func=lambda x: cls.cms_colocation(x)
36-
)
40+
func=lambda x: cls.tape_metadata(x)
41+
)
3742

3843
@staticmethod
3944
def _encode(name: Any) -> Optional[str]:
@@ -44,33 +49,34 @@ def _encode(name: Any) -> Optional[str]:
4449
return None
4550

4651
@staticmethod
47-
def parent_container(name):
52+
def parent_container(scope, name):
4853
# Custom logic for CMS
4954
# If dataset - look for the parent container
5055
# If file - look for the parent dataset and then the parent container
51-
scope = InternalScope("cms")
52-
try:
56+
if not isinstance(scope, InternalScope):
57+
scope = InternalScope(scope)
58+
try:
5359
is_file = get_did(scope=scope, name=name)['type'] == DIDType.FILE
5460
except DataIdentifierNotFound:
5561
logger.warning("DID not found for %s:%s", scope, name)
5662
return None
57-
try:
63+
try:
5864
if is_file:
5965
parent_dataset = [parent
60-
for parent
61-
in list_parent_dids(scope=scope, name=name)
66+
for parent
67+
in list_parent_dids(scope=scope, name=name)
6268
][0]
6369
containers = [
6470
parent['name']
65-
for parent
66-
in list_parent_dids(scope=scope, name=parent_dataset['name'])
71+
for parent
72+
in list_parent_dids(scope=scope, name=parent_dataset['name'])
6773
if parent['type'] == DIDType.CONTAINER
6874
]
6975
else:
7076
containers = [
7177
parent['name']
72-
for parent
73-
in list_parent_dids(scope=scope, name=name)
78+
for parent
79+
in list_parent_dids(scope=scope, name=name)
7480
if parent['type']==DIDType.CONTAINER
7581
]
7682
container = CMSTapeColocation._encode(containers[0])
@@ -82,104 +88,162 @@ def parent_container(name):
8288
logger.debug("No parent container found for %s:%s", scope, name)
8389

8490
@staticmethod
85-
def _is_raw(name):
91+
def _is_raw(name):
8692
# Raw always contains "RAW" in the name
8793
return any(i=="RAW" for i in name.split('/'))
8894

8995
@staticmethod
9096
def _is_parking(name):
9197
# Parking is denoted by having ParkingXXXX in the lfn
92-
try:
98+
try:
9399
return any(n.startswith("Parking") for n in name.split('/'))
94100
except IndexError:
95101
return False
96102

97103
@staticmethod
98-
def data_type(name):
104+
def data_type(name):
99105
data_type = name.removeprefix('/store/').split('/')[0] # First index that isn't `store`
100-
101106
# Custom logic: Use parking or raw over "data", use hiraw if heavy ion and raw
102-
if data_type not in CMSTapeColocation.allowed_types:
107+
if data_type not in CMSTapeColocation.allowed_types:
103108
data_type = "n/a"
104-
elif CMSTapeColocation._is_parking(name):
109+
elif CMSTapeColocation._is_parking(name):
105110
data_type = CMSTapeColocation.parking_name
106111
elif CMSTapeColocation._is_raw(name):
107-
if data_type.startswith("hi"):
112+
if data_type.startswith("hi"):
108113
data_type = CMSTapeColocation.hiraw_name
109-
else:
114+
else:
110115
data_type = CMSTapeColocation.raw_name
111116

112117
return data_type
113118

114119
@staticmethod
115-
def data_tier(name):
116-
try:
120+
def data_tier(name):
121+
try:
117122
tier = name.removeprefix('/store/').split('/')[3]
118123
if CMSTapeColocation._encode(tier) is None:
119124
logger.debug("Could not encode data tier for %s", name)
120125
return tier
121-
except IndexError:
126+
except IndexError:
122127
logger.debug("Could not determine data tier for %s", name)
123128

124129
@staticmethod
125-
def era(name):
126-
try:
130+
def era(name):
131+
try:
127132
era = name.removeprefix('/store/').split('/')[1]
128133
if CMSTapeColocation._encode(era) is None:
129134
logger.debug("Could not encode era for %s", name)
130135
return era
131-
except IndexError:
136+
except IndexError:
132137
logger.debug("Could not determine era for %s", name)
133138

139+
@staticmethod
140+
def _get_container_stats(scope, name):
141+
size = 0
142+
length = 0
143+
if not isinstance(scope, InternalScope):
144+
scope = InternalScope(scope)
145+
try:
146+
contents = list_content(scope, name)
147+
for item in contents:
148+
if item['type'] == DIDType.FILE:
149+
size += item.get('bytes', 0)
150+
length += 1
151+
elif item['type'] in [DIDType.DATASET, DIDType.CONTAINER]:
152+
# Recursively get size of nested datasets/containers
153+
sub_length, sub_size = CMSTapeColocation._get_container_stats(item['name'])
154+
length += sub_length
155+
size += sub_size
156+
except DataIdentifierNotFound:
157+
logger.warning("DID not found for container %s:%s", scope, name)
158+
159+
return length, size
160+
134161
@classmethod
135-
def cms_colocation(cls, hints):
162+
def tape_metadata(cls, hints):
136163
"""
137164
https://github.com/dmwm/CMSRucio/issues/753
138165
https://github.com/dmwm/CMSRucio/issues/323
139-
140-
Level 0
141-
Data/MC/HIData/HiMC (from /store/(data/mc/hi/data/himc) plus RAW and HIRAW, and Parking.
142-
143-
Level 1
144-
Data tier - either in the LFN or the end of the parent container
145-
146-
Level 2
147-
Era (which for MC is the Campaign)
148-
149-
Level 3
150-
Parent Container (parent container of dataset if file)
151-
166+
https://its.cern.ch/jira/browse/CMSDM-315
152167
153-
Examples:
154-
* Parking data:
155-
/store/data/Run2024C/ParkingVBF5/RAW/v1/000/380/115/00000/b4c0513e-f732-42b1-858d-572c86ce4b97.root
156-
--> {'0': 'parking', '1': 'RAW', '2': 'Run2024C', '3': '/ParkingVBF5/Run2024C-v1/RAW'}
157-
* Raw:
158-
/store/hidata/HIRun2024B/HIPhysicsRawPrime5/RAW/v1/000/388/624/00000/fa0795b5-633b-461c-bc21-02d40a118dd2.root
159-
--> {'0': 'hiraw', '1': 'RAW', '2': 'HIRun2024B', '3': '/HIPhysicsRawPrime5/HIRun2024B-v1/RAW'}
160-
168+
Tape Colocation:
169+
Level 0
170+
Data/MC/HIData/HiMC (from /store/(data/mc/hi/data/himc) plus RAW and HIRAW, and Parking.
171+
172+
Level 1
173+
Data tier - either in the LFN or the end of the parent container
174+
175+
Level 2
176+
Era (which for MC is the Campaign)
177+
178+
Level 3
179+
Parent Container (parent container of dataset if file)
180+
181+
182+
Examples:
183+
* Parking data:
184+
/store/data/Run2024C/ParkingVBF5/RAW/v1/000/380/115/00000/b4c0513e-f732-42b1-858d-572c86ce4b97.root
185+
--> {'0': 'parking', '1': 'RAW', '2': 'Run2024C', '3': '/ParkingVBF5/Run2024C-v1/RAW'}
186+
* Raw:
187+
/store/hidata/HIRun2024B/HIPhysicsRawPrime5/RAW/v1/000/388/624/00000/fa0795b5-633b-461c-bc21-02d40a118dd2.root
188+
--> {'0': 'hiraw', '1': 'RAW', '2': 'HIRun2024B', '3': '/HIPhysicsRawPrime5/HIRun2024B-v1/RAW'}
189+
190+
File Metadata:
191+
As given by Rucio hints:
192+
* Size
193+
* MD5
194+
* Adler32
195+
196+
Additional Hints:
197+
* Activity (default to "default" if not given)
198+
* Level 3 length and size (if parent container exists)
199+
- Length - total number of files in the parent container, including nested datasets/containers
200+
- Size - total size of all files in the parent container, including nested datasets/containers
201+
161202
"""
162203

163204
lfn = hints['name']
164205
data_type = cls.data_type(lfn)
165-
166206
colocation = {
167207
"0": data_type,
168208
}
169209

170210
if data_type != "n/a":
171211
tier = cls.data_tier(lfn)
172212
era = cls.era(lfn)
173-
parent = cls.parent_container(hints['name'])
174-
if tier is not None:
213+
parent = cls.parent_container(hints['scope'], hints['name'])
214+
if tier is not None:
175215
colocation['1'] = tier
176-
if era is not None:
216+
if era is not None:
177217
colocation['2'] = era
178-
if parent is not None:
218+
if parent is not None:
179219
colocation['3'] = parent
180-
else:
220+
else:
221+
parent = None
181222
logger.debug("Could not determine data type for %s", lfn)
182223

183-
# TODO Speak with FTS3 Team about these headers
184224
logger.debug("Setting colocation hints %s", colocation)
185-
return {"collocation_hints": colocation}
225+
226+
additional_hints = {
227+
"activity": hints.get("activity", "default"),
228+
}
229+
if parent is not None:
230+
length, size = CMSTapeColocation._get_container_stats(hints['scope'], parent)
231+
additional_hints['3'] = {
232+
"length": length, #The number of files in the parent container
233+
"size": size, #The total size of the parent container
234+
}
235+
logger.debug("Setting additional hints %s", additional_hints)
236+
237+
metadata = {
238+
"size": hints['metadata'].get('filesize', 0), # File size
239+
"md5": hints['metadata'].get("md5"), # MD5 checksum
240+
"adler32": hints['metadata'].get("adler32"), # Adler32 checksum
241+
}
242+
logger.debug("File metadata: %s", metadata)
243+
244+
return {
245+
"collocation_hints": colocation,
246+
"additional_hints": additional_hints,
247+
"file_metadata": metadata,
248+
"schema_version": cls.schema_version
249+
}

0 commit comments

Comments
 (0)