22Tape colocation algorithm for placement of CMS data on tape
33'''
44
5+ import json
6+ import logging
57from typing import Any , Optional , Union
8+
69from rucio .common .exception import DataIdentifierNotFound
7- from rucio .transfertool .fts3_plugins import FTS3TapeMetadataPlugin
8- from rucio .core .did import list_parent_dids , get_did
9- from rucio .db .sqla .constants import DIDType
1010from rucio .common .types import InternalScope
11- import logging
12- import json
11+ from rucio .core .did import get_did , list_content , list_parent_dids
12+ from rucio .db .sqla .constants import DIDType
13+ from rucio .transfertool .fts3_plugins import FTS3TapeMetadataPlugin
1314
1415logger = logging .getLogger (__name__ )
1516
16- class CMSTapeColocation (FTS3TapeMetadataPlugin ):
17- policy_algorithm = "tape_colocation "
17+ class CMSTapeColocation (FTS3TapeMetadataPlugin ):
18+ policy_algorithm = "tape_metadata "
1819
20+ # Logic for tape colocation
1921 allowed_types = ['data' , 'hidata' , 'mc' , 'himc' , 'relval' , 'hirelval' ]
2022 parking_name = "parking"
2123 raw_name = "raw"
2224 hiraw_name = "hiraw"
25+
26+ # Schema version as of June 26, 2025
27+ schema_version = 1
2328
2429 def __init__ (self , policy : Union [str , None ] = None ) -> None :
2530
@@ -32,8 +37,8 @@ def _module_init_(cls) -> None:
3237 logger .info ("Registered plugin %s" , cls .policy_algorithm )
3338 cls .register (
3439 cls .policy_algorithm ,
35- func = lambda x : cls .cms_colocation (x )
36- )
40+ func = lambda x : cls .tape_metadata (x )
41+ )
3742
3843 @staticmethod
3944 def _encode (name : Any ) -> Optional [str ]:
@@ -44,33 +49,33 @@ def _encode(name: Any) -> Optional[str]:
4449 return None
4550
4651 @staticmethod
47- def parent_container (name ):
52+ def parent_container (scope , name ):
4853 # Custom logic for CMS
4954 # If dataset - look for the parent container
5055 # If file - look for the parent dataset and then the parent container
51- scope = InternalScope ("cms" )
52- try :
56+ scope = InternalScope (scope )
57+ try :
5358 is_file = get_did (scope = scope , name = name )['type' ] == DIDType .FILE
5459 except DataIdentifierNotFound :
5560 logger .warning ("DID not found for %s:%s" , scope , name )
5661 return None
57- try :
62+ try :
5863 if is_file :
5964 parent_dataset = [parent
60- for parent
61- in list_parent_dids (scope = scope , name = name )
65+ for parent
66+ in list_parent_dids (scope = scope , name = name )
6267 ][0 ]
6368 containers = [
6469 parent ['name' ]
65- for parent
66- in list_parent_dids (scope = scope , name = parent_dataset ['name' ])
70+ for parent
71+ in list_parent_dids (scope = scope , name = parent_dataset ['name' ])
6772 if parent ['type' ] == DIDType .CONTAINER
6873 ]
6974 else :
7075 containers = [
7176 parent ['name' ]
72- for parent
73- in list_parent_dids (scope = scope , name = name )
77+ for parent
78+ in list_parent_dids (scope = scope , name = name )
7479 if parent ['type' ]== DIDType .CONTAINER
7580 ]
7681 container = CMSTapeColocation ._encode (containers [0 ])
@@ -82,104 +87,161 @@ def parent_container(name):
8287 logger .debug ("No parent container found for %s:%s" , scope , name )
8388
8489 @staticmethod
85- def _is_raw (name ):
90+ def _is_raw (name ):
8691 # Raw always contains "RAW" in the name
8792 return any (i == "RAW" for i in name .split ('/' ))
8893
8994 @staticmethod
9095 def _is_parking (name ):
9196 # Parking is denoted by having ParkingXXXX in the lfn
92- try :
97+ try :
9398 return any (n .startswith ("Parking" ) for n in name .split ('/' ))
9499 except IndexError :
95100 return False
96101
97102 @staticmethod
98- def data_type (name ):
103+ def data_type (name ):
99104 data_type = name .removeprefix ('/store/' ).split ('/' )[0 ] # First index that isn't `store`
100-
101105 # Custom logic: Use parking or raw over "data", use hiraw if heavy ion and raw
102- if data_type not in CMSTapeColocation .allowed_types :
106+ if data_type not in CMSTapeColocation .allowed_types :
103107 data_type = "n/a"
104- elif CMSTapeColocation ._is_parking (name ):
108+ elif CMSTapeColocation ._is_parking (name ):
105109 data_type = CMSTapeColocation .parking_name
106110 elif CMSTapeColocation ._is_raw (name ):
107- if data_type .startswith ("hi" ):
111+ if data_type .startswith ("hi" ):
108112 data_type = CMSTapeColocation .hiraw_name
109- else :
113+ else :
110114 data_type = CMSTapeColocation .raw_name
111115
112116 return data_type
113117
114118 @staticmethod
115- def data_tier (name ):
116- try :
119+ def data_tier (name ):
120+ try :
117121 tier = name .removeprefix ('/store/' ).split ('/' )[3 ]
118122 if CMSTapeColocation ._encode (tier ) is None :
119123 logger .debug ("Could not encode data tier for %s" , name )
120124 return tier
121- except IndexError :
125+ except IndexError :
122126 logger .debug ("Could not determine data tier for %s" , name )
123127
124128 @staticmethod
125- def era (name ):
126- try :
129+ def era (name ):
130+ try :
127131 era = name .removeprefix ('/store/' ).split ('/' )[1 ]
128132 if CMSTapeColocation ._encode (era ) is None :
129133 logger .debug ("Could not encode era for %s" , name )
130134 return era
131- except IndexError :
135+ except IndexError :
132136 logger .debug ("Could not determine era for %s" , name )
133137
138+ @staticmethod
139+ def _get_container_stats (scope , name ):
140+ size = 0
141+ length = 0
142+ scope = InternalScope (scope )
143+ try :
144+ contents = list_content (scope , name )
145+ for item in contents :
146+ if item ['type' ] == DIDType .FILE :
147+ size += item .get ('bytes' , 0 )
148+ length += 1
149+ elif item ['type' ] in [DIDType .DATASET , DIDType .CONTAINER ]:
150+ # Recursively get size of nested datasets/containers
151+ sub_length , sub_size = CMSTapeColocation ._get_container_stats (item ['name' ])
152+ length += sub_length
153+ size += sub_size
154+ except DataIdentifierNotFound :
155+ logger .warning ("DID not found for container %s:%s" , scope , name )
156+
157+ return length , size
158+
134159 @classmethod
135- def cms_colocation (cls , hints ):
160+ def tape_metadata (cls , hints ):
136161 """
137162 https://github.com/dmwm/CMSRucio/issues/753
138163 https://github.com/dmwm/CMSRucio/issues/323
139-
140- Level 0
141- Data/MC/HIData/HiMC (from /store/(data/mc/hi/data/himc) plus RAW and HIRAW, and Parking.
142-
143- Level 1
144- Data tier - either in the LFN or the end of the parent container
145-
146- Level 2
147- Era (which for MC is the Campaign)
148-
149- Level 3
150- Parent Container (parent container of dataset if file)
151-
164+ https://its.cern.ch/jira/browse/CMSDM-315
152165
153- Examples:
154- * Parking data:
155- /store/data/Run2024C/ParkingVBF5/RAW/v1/000/380/115/00000/b4c0513e-f732-42b1-858d-572c86ce4b97.root
156- --> {'0': 'parking', '1': 'RAW', '2': 'Run2024C', '3': '/ParkingVBF5/Run2024C-v1/RAW'}
157- * Raw:
158- /store/hidata/HIRun2024B/HIPhysicsRawPrime5/RAW/v1/000/388/624/00000/fa0795b5-633b-461c-bc21-02d40a118dd2.root
159- --> {'0': 'hiraw', '1': 'RAW', '2': 'HIRun2024B', '3': '/HIPhysicsRawPrime5/HIRun2024B-v1/RAW'}
160-
166+ Tape Colocation:
167+ Level 0
168+ Data/MC/HIData/HiMC (from /store/(data/mc/hi/data/himc) plus RAW and HIRAW, and Parking.
169+
170+ Level 1
171+ Data tier - either in the LFN or the end of the parent container
172+
173+ Level 2
174+ Era (which for MC is the Campaign)
175+
176+ Level 3
177+ Parent Container (parent container of dataset if file)
178+
179+
180+ Examples:
181+ * Parking data:
182+ /store/data/Run2024C/ParkingVBF5/RAW/v1/000/380/115/00000/b4c0513e-f732-42b1-858d-572c86ce4b97.root
183+ --> {'0': 'parking', '1': 'RAW', '2': 'Run2024C', '3': '/ParkingVBF5/Run2024C-v1/RAW'}
184+ * Raw:
185+ /store/hidata/HIRun2024B/HIPhysicsRawPrime5/RAW/v1/000/388/624/00000/fa0795b5-633b-461c-bc21-02d40a118dd2.root
186+ --> {'0': 'hiraw', '1': 'RAW', '2': 'HIRun2024B', '3': '/HIPhysicsRawPrime5/HIRun2024B-v1/RAW'}
187+
188+ File Metadata:
189+ As given by Rucio hints:
190+ * Size
191+ * MD5
192+ * Adler32
193+
194+ Additional Hints:
195+ * Activity (default to "default" if not given)
196+ * Level 3 length and size (if parent container exists)
197+ - Length - total number of files in the parent container, including nested datasets/containers
198+ - Size - total size of all files in the parent container, including nested datasets/containers
199+
161200 """
162201
163202 lfn = hints ['name' ]
164203 data_type = cls .data_type (lfn )
165-
166204 colocation = {
167205 "0" : data_type ,
168206 }
169207
170208 if data_type != "n/a" :
171209 tier = cls .data_tier (lfn )
172210 era = cls .era (lfn )
173- parent = cls .parent_container (hints ['name' ])
174- if tier is not None :
211+ parent = cls .parent_container (hints ['scope' ], hints [ ' name' ])
212+ if tier is not None :
175213 colocation ['1' ] = tier
176- if era is not None :
214+ if era is not None :
177215 colocation ['2' ] = era
178- if parent is not None :
216+ if parent is not None :
179217 colocation ['3' ] = parent
180- else :
218+ else :
219+ parent = None
181220 logger .debug ("Could not determine data type for %s" , lfn )
182221
183- # TODO Speak with FTS3 Team about these headers
184222 logger .debug ("Setting colocation hints %s" , colocation )
185- return {"collocation_hints" : colocation }
223+
224+ additional_hints = {
225+ "activity" : hints .get ("activity" , "default" ),
226+ }
227+ if parent is not None :
228+ length , size = CMSTapeColocation ._get_container_stats (hints ['scope' ], parent )
229+ additional_hints ['3' ] = {
230+ "length" : length , #The number of files in the parent container
231+ "size" : size , #The total size of the parent container
232+ }
233+ logger .debug ("Setting additional hints %s" , additional_hints )
234+
235+ metadata = {
236+ "size" : hints ['metadata' ].get ('filesize' , 0 ), # File size
237+ "md5" : hints ['metadata' ].get ("md5" ), # MD5 checksum
238+ "adler32" : hints ['metadata' ].get ("adler32" ), # Adler32 checksum
239+ }
240+ logger .debug ("File metadata: %s" , metadata )
241+
242+ return {
243+ "collocation_hints" : colocation ,
244+ "additional_hints" : additional_hints ,
245+ "file_metadata" : metadata ,
246+ "schema_version" : cls .schema_version
247+ }
0 commit comments