Skip to content

Commit 644cd6b

Browse files
Merge pull request #941 from voetberg/315-more-tape-metadata
CMSDM-315: Include additional metadata in the tape colocation plugin
2 parents cc2a714 + 32f6a98 commit 644cd6b

File tree

1 file changed

+125
-63
lines changed

1 file changed

+125
-63
lines changed

src/policy/CMSRucioPolicy/algorithms/tape_colocation.py

Lines changed: 125 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,29 @@
22
Tape colocation algorithm for placement of CMS data on tape
33
'''
44

5+
import json
6+
import logging
57
from typing import Any, Optional, Union
8+
69
from rucio.common.exception import DataIdentifierNotFound
7-
from rucio.transfertool.fts3_plugins import FTS3TapeMetadataPlugin
8-
from rucio.core.did import list_parent_dids, get_did
9-
from rucio.db.sqla.constants import DIDType
1010
from rucio.common.types import InternalScope
11-
import logging
12-
import json
11+
from rucio.core.did import get_did, list_content, list_parent_dids
12+
from rucio.db.sqla.constants import DIDType
13+
from rucio.transfertool.fts3_plugins import FTS3TapeMetadataPlugin
1314

1415
logger = logging.getLogger(__name__)
1516

16-
class CMSTapeColocation(FTS3TapeMetadataPlugin):
17-
policy_algorithm = "tape_colocation"
17+
class CMSTapeColocation(FTS3TapeMetadataPlugin):
18+
policy_algorithm = "tape_metadata"
1819

20+
# Logic for tape colocation
1921
allowed_types = ['data', 'hidata', 'mc', 'himc', 'relval', 'hirelval']
2022
parking_name = "parking"
2123
raw_name = "raw"
2224
hiraw_name = "hiraw"
25+
26+
# Schema version as of June 26, 2025
27+
schema_version = 1
2328

2429
def __init__(self, policy: Union[str, None] = None) -> None:
2530

@@ -32,8 +37,8 @@ def _module_init_(cls) -> None:
3237
logger.info("Registered plugin %s", cls.policy_algorithm)
3338
cls.register(
3439
cls.policy_algorithm,
35-
func=lambda x: cls.cms_colocation(x)
36-
)
40+
func=lambda x: cls.tape_metadata(x)
41+
)
3742

3843
@staticmethod
3944
def _encode(name: Any) -> Optional[str]:
@@ -44,33 +49,33 @@ def _encode(name: Any) -> Optional[str]:
4449
return None
4550

4651
@staticmethod
47-
def parent_container(name):
52+
def parent_container(scope, name):
4853
# Custom logic for CMS
4954
# If dataset - look for the parent container
5055
# If file - look for the parent dataset and then the parent container
51-
scope = InternalScope("cms")
52-
try:
56+
scope = InternalScope(scope)
57+
try:
5358
is_file = get_did(scope=scope, name=name)['type'] == DIDType.FILE
5459
except DataIdentifierNotFound:
5560
logger.warning("DID not found for %s:%s", scope, name)
5661
return None
57-
try:
62+
try:
5863
if is_file:
5964
parent_dataset = [parent
60-
for parent
61-
in list_parent_dids(scope=scope, name=name)
65+
for parent
66+
in list_parent_dids(scope=scope, name=name)
6267
][0]
6368
containers = [
6469
parent['name']
65-
for parent
66-
in list_parent_dids(scope=scope, name=parent_dataset['name'])
70+
for parent
71+
in list_parent_dids(scope=scope, name=parent_dataset['name'])
6772
if parent['type'] == DIDType.CONTAINER
6873
]
6974
else:
7075
containers = [
7176
parent['name']
72-
for parent
73-
in list_parent_dids(scope=scope, name=name)
77+
for parent
78+
in list_parent_dids(scope=scope, name=name)
7479
if parent['type']==DIDType.CONTAINER
7580
]
7681
container = CMSTapeColocation._encode(containers[0])
@@ -82,104 +87,161 @@ def parent_container(name):
8287
logger.debug("No parent container found for %s:%s", scope, name)
8388

8489
@staticmethod
85-
def _is_raw(name):
90+
def _is_raw(name):
8691
# Raw always contains "RAW" in the name
8792
return any(i=="RAW" for i in name.split('/'))
8893

8994
@staticmethod
9095
def _is_parking(name):
9196
# Parking is denoted by having ParkingXXXX in the lfn
92-
try:
97+
try:
9398
return any(n.startswith("Parking") for n in name.split('/'))
9499
except IndexError:
95100
return False
96101

97102
@staticmethod
98-
def data_type(name):
103+
def data_type(name):
99104
data_type = name.removeprefix('/store/').split('/')[0] # First index that isn't `store`
100-
101105
# Custom logic: Use parking or raw over "data", use hiraw if heavy ion and raw
102-
if data_type not in CMSTapeColocation.allowed_types:
106+
if data_type not in CMSTapeColocation.allowed_types:
103107
data_type = "n/a"
104-
elif CMSTapeColocation._is_parking(name):
108+
elif CMSTapeColocation._is_parking(name):
105109
data_type = CMSTapeColocation.parking_name
106110
elif CMSTapeColocation._is_raw(name):
107-
if data_type.startswith("hi"):
111+
if data_type.startswith("hi"):
108112
data_type = CMSTapeColocation.hiraw_name
109-
else:
113+
else:
110114
data_type = CMSTapeColocation.raw_name
111115

112116
return data_type
113117

114118
@staticmethod
115-
def data_tier(name):
116-
try:
119+
def data_tier(name):
120+
try:
117121
tier = name.removeprefix('/store/').split('/')[3]
118122
if CMSTapeColocation._encode(tier) is None:
119123
logger.debug("Could not encode data tier for %s", name)
120124
return tier
121-
except IndexError:
125+
except IndexError:
122126
logger.debug("Could not determine data tier for %s", name)
123127

124128
@staticmethod
125-
def era(name):
126-
try:
129+
def era(name):
130+
try:
127131
era = name.removeprefix('/store/').split('/')[1]
128132
if CMSTapeColocation._encode(era) is None:
129133
logger.debug("Could not encode era for %s", name)
130134
return era
131-
except IndexError:
135+
except IndexError:
132136
logger.debug("Could not determine era for %s", name)
133137

138+
@staticmethod
139+
def _get_container_stats(scope, name):
140+
size = 0
141+
length = 0
142+
scope = InternalScope(scope)
143+
try:
144+
contents = list_content(scope, name)
145+
for item in contents:
146+
if item['type'] == DIDType.FILE:
147+
size += item.get('bytes', 0)
148+
length += 1
149+
elif item['type'] in [DIDType.DATASET, DIDType.CONTAINER]:
150+
# Recursively get size of nested datasets/containers
151+
sub_length, sub_size = CMSTapeColocation._get_container_stats(item['name'])
152+
length += sub_length
153+
size += sub_size
154+
except DataIdentifierNotFound:
155+
logger.warning("DID not found for container %s:%s", scope, name)
156+
157+
return length, size
158+
134159
@classmethod
135-
def cms_colocation(cls, hints):
160+
def tape_metadata(cls, hints):
136161
"""
137162
https://github.com/dmwm/CMSRucio/issues/753
138163
https://github.com/dmwm/CMSRucio/issues/323
139-
140-
Level 0
141-
Data/MC/HIData/HiMC (from /store/(data/mc/hi/data/himc) plus RAW and HIRAW, and Parking.
142-
143-
Level 1
144-
Data tier - either in the LFN or the end of the parent container
145-
146-
Level 2
147-
Era (which for MC is the Campaign)
148-
149-
Level 3
150-
Parent Container (parent container of dataset if file)
151-
164+
https://its.cern.ch/jira/browse/CMSDM-315
152165
153-
Examples:
154-
* Parking data:
155-
/store/data/Run2024C/ParkingVBF5/RAW/v1/000/380/115/00000/b4c0513e-f732-42b1-858d-572c86ce4b97.root
156-
--> {'0': 'parking', '1': 'RAW', '2': 'Run2024C', '3': '/ParkingVBF5/Run2024C-v1/RAW'}
157-
* Raw:
158-
/store/hidata/HIRun2024B/HIPhysicsRawPrime5/RAW/v1/000/388/624/00000/fa0795b5-633b-461c-bc21-02d40a118dd2.root
159-
--> {'0': 'hiraw', '1': 'RAW', '2': 'HIRun2024B', '3': '/HIPhysicsRawPrime5/HIRun2024B-v1/RAW'}
160-
166+
Tape Colocation:
167+
Level 0
168+
Data/MC/HIData/HiMC (from /store/(data/mc/hi/data/himc) plus RAW and HIRAW, and Parking.
169+
170+
Level 1
171+
Data tier - either in the LFN or the end of the parent container
172+
173+
Level 2
174+
Era (which for MC is the Campaign)
175+
176+
Level 3
177+
Parent Container (parent container of dataset if file)
178+
179+
180+
Examples:
181+
* Parking data:
182+
/store/data/Run2024C/ParkingVBF5/RAW/v1/000/380/115/00000/b4c0513e-f732-42b1-858d-572c86ce4b97.root
183+
--> {'0': 'parking', '1': 'RAW', '2': 'Run2024C', '3': '/ParkingVBF5/Run2024C-v1/RAW'}
184+
* Raw:
185+
/store/hidata/HIRun2024B/HIPhysicsRawPrime5/RAW/v1/000/388/624/00000/fa0795b5-633b-461c-bc21-02d40a118dd2.root
186+
--> {'0': 'hiraw', '1': 'RAW', '2': 'HIRun2024B', '3': '/HIPhysicsRawPrime5/HIRun2024B-v1/RAW'}
187+
188+
File Metadata:
189+
As given by Rucio hints:
190+
* Size
191+
* MD5
192+
* Adler32
193+
194+
Additional Hints:
195+
* Activity (default to "default" if not given)
196+
* Level 3 length and size (if parent container exists)
197+
- Length - total number of files in the parent container, including nested datasets/containers
198+
- Size - total size of all files in the parent container, including nested datasets/containers
199+
161200
"""
162201

163202
lfn = hints['name']
164203
data_type = cls.data_type(lfn)
165-
166204
colocation = {
167205
"0": data_type,
168206
}
169207

170208
if data_type != "n/a":
171209
tier = cls.data_tier(lfn)
172210
era = cls.era(lfn)
173-
parent = cls.parent_container(hints['name'])
174-
if tier is not None:
211+
parent = cls.parent_container(hints['scope'], hints['name'])
212+
if tier is not None:
175213
colocation['1'] = tier
176-
if era is not None:
214+
if era is not None:
177215
colocation['2'] = era
178-
if parent is not None:
216+
if parent is not None:
179217
colocation['3'] = parent
180-
else:
218+
else:
219+
parent = None
181220
logger.debug("Could not determine data type for %s", lfn)
182221

183-
# TODO Speak with FTS3 Team about these headers
184222
logger.debug("Setting colocation hints %s", colocation)
185-
return {"collocation_hints": colocation}
223+
224+
additional_hints = {
225+
"activity": hints.get("activity", "default"),
226+
}
227+
if parent is not None:
228+
length, size = CMSTapeColocation._get_container_stats(hints['scope'], parent)
229+
additional_hints['3'] = {
230+
"length": length, #The number of files in the parent container
231+
"size": size, #The total size of the parent container
232+
}
233+
logger.debug("Setting additional hints %s", additional_hints)
234+
235+
metadata = {
236+
"size": hints['metadata'].get('filesize', 0), # File size
237+
"md5": hints['metadata'].get("md5"), # MD5 checksum
238+
"adler32": hints['metadata'].get("adler32"), # Adler32 checksum
239+
}
240+
logger.debug("File metadata: %s", metadata)
241+
242+
return {
243+
"collocation_hints": colocation,
244+
"additional_hints": additional_hints,
245+
"file_metadata": metadata,
246+
"schema_version": cls.schema_version
247+
}

0 commit comments

Comments
 (0)