22Tape colocation algorithm for placement of CMS data on tape
33'''
44
5+ import json
6+ import logging
57from typing import Any , Optional , Union
8+
69from rucio .common .exception import DataIdentifierNotFound
7- from rucio .transfertool .fts3_plugins import FTS3TapeMetadataPlugin
8- from rucio .core .did import list_parent_dids , get_did
9- from rucio .db .sqla .constants import DIDType
1010from rucio .common .types import InternalScope
11- import logging
12- import json
11+ from rucio .core .did import get_did , list_content , list_parent_dids
12+ from rucio .db .sqla .constants import DIDType
13+ from rucio .transfertool .fts3_plugins import FTS3TapeMetadataPlugin
1314
1415logger = logging .getLogger (__name__ )
1516
16- class CMSTapeColocation (FTS3TapeMetadataPlugin ):
17- policy_algorithm = "tape_colocation "
17+ class CMSTapeColocation (FTS3TapeMetadataPlugin ):
18+ policy_algorithm = "tape_metadata "
1819
20+ # Logic for tape colocation
1921 allowed_types = ['data' , 'hidata' , 'mc' , 'himc' , 'relval' , 'hirelval' ]
2022 parking_name = "parking"
2123 raw_name = "raw"
2224 hiraw_name = "hiraw"
25+
26+ # Schema version as of June 26, 2025
27+ schema_version = 1
2328
2429 def __init__ (self , policy : Union [str , None ] = None ) -> None :
2530
@@ -32,8 +37,8 @@ def _module_init_(cls) -> None:
3237 logger .info ("Registered plugin %s" , cls .policy_algorithm )
3338 cls .register (
3439 cls .policy_algorithm ,
35- func = lambda x : cls .cms_colocation (x )
36- )
40+ func = lambda x : cls .tape_metadata (x )
41+ )
3742
3843 @staticmethod
3944 def _encode (name : Any ) -> Optional [str ]:
@@ -44,33 +49,34 @@ def _encode(name: Any) -> Optional[str]:
4449 return None
4550
4651 @staticmethod
47- def parent_container (name ):
52+ def parent_container (scope , name ):
4853 # Custom logic for CMS
4954 # If dataset - look for the parent container
5055 # If file - look for the parent dataset and then the parent container
51- scope = InternalScope ("cms" )
52- try :
56+ if not isinstance (scope , InternalScope ):
57+ scope = InternalScope (scope )
58+ try :
5359 is_file = get_did (scope = scope , name = name )['type' ] == DIDType .FILE
5460 except DataIdentifierNotFound :
5561 logger .warning ("DID not found for %s:%s" , scope , name )
5662 return None
57- try :
63+ try :
5864 if is_file :
5965 parent_dataset = [parent
60- for parent
61- in list_parent_dids (scope = scope , name = name )
66+ for parent
67+ in list_parent_dids (scope = scope , name = name )
6268 ][0 ]
6369 containers = [
6470 parent ['name' ]
65- for parent
66- in list_parent_dids (scope = scope , name = parent_dataset ['name' ])
71+ for parent
72+ in list_parent_dids (scope = scope , name = parent_dataset ['name' ])
6773 if parent ['type' ] == DIDType .CONTAINER
6874 ]
6975 else :
7076 containers = [
7177 parent ['name' ]
72- for parent
73- in list_parent_dids (scope = scope , name = name )
78+ for parent
79+ in list_parent_dids (scope = scope , name = name )
7480 if parent ['type' ]== DIDType .CONTAINER
7581 ]
7682 container = CMSTapeColocation ._encode (containers [0 ])
@@ -82,104 +88,162 @@ def parent_container(name):
8288 logger .debug ("No parent container found for %s:%s" , scope , name )
8389
8490 @staticmethod
85- def _is_raw (name ):
91+ def _is_raw (name ):
8692 # Raw always contains "RAW" in the name
8793 return any (i == "RAW" for i in name .split ('/' ))
8894
8995 @staticmethod
9096 def _is_parking (name ):
9197 # Parking is denoted by having ParkingXXXX in the lfn
92- try :
98+ try :
9399 return any (n .startswith ("Parking" ) for n in name .split ('/' ))
94100 except IndexError :
95101 return False
96102
97103 @staticmethod
98- def data_type (name ):
104+ def data_type (name ):
99105 data_type = name .removeprefix ('/store/' ).split ('/' )[0 ] # First index that isn't `store`
100-
101106 # Custom logic: Use parking or raw over "data", use hiraw if heavy ion and raw
102- if data_type not in CMSTapeColocation .allowed_types :
107+ if data_type not in CMSTapeColocation .allowed_types :
103108 data_type = "n/a"
104- elif CMSTapeColocation ._is_parking (name ):
109+ elif CMSTapeColocation ._is_parking (name ):
105110 data_type = CMSTapeColocation .parking_name
106111 elif CMSTapeColocation ._is_raw (name ):
107- if data_type .startswith ("hi" ):
112+ if data_type .startswith ("hi" ):
108113 data_type = CMSTapeColocation .hiraw_name
109- else :
114+ else :
110115 data_type = CMSTapeColocation .raw_name
111116
112117 return data_type
113118
114119 @staticmethod
115- def data_tier (name ):
116- try :
120+ def data_tier (name ):
121+ try :
117122 tier = name .removeprefix ('/store/' ).split ('/' )[3 ]
118123 if CMSTapeColocation ._encode (tier ) is None :
119124 logger .debug ("Could not encode data tier for %s" , name )
120125 return tier
121- except IndexError :
126+ except IndexError :
122127 logger .debug ("Could not determine data tier for %s" , name )
123128
124129 @staticmethod
125- def era (name ):
126- try :
130+ def era (name ):
131+ try :
127132 era = name .removeprefix ('/store/' ).split ('/' )[1 ]
128133 if CMSTapeColocation ._encode (era ) is None :
129134 logger .debug ("Could not encode era for %s" , name )
130135 return era
131- except IndexError :
136+ except IndexError :
132137 logger .debug ("Could not determine era for %s" , name )
133138
139+ @staticmethod
140+ def _get_container_stats (scope , name ):
141+ size = 0
142+ length = 0
143+ if not isinstance (scope , InternalScope ):
144+ scope = InternalScope (scope )
145+ try :
146+ contents = list_content (scope , name )
147+ for item in contents :
148+ if item ['type' ] == DIDType .FILE :
149+ size += item .get ('bytes' , 0 )
150+ length += 1
151+ elif item ['type' ] in [DIDType .DATASET , DIDType .CONTAINER ]:
152+ # Recursively get size of nested datasets/containers
153+ sub_length , sub_size = CMSTapeColocation ._get_container_stats (item ['name' ])
154+ length += sub_length
155+ size += sub_size
156+ except DataIdentifierNotFound :
157+ logger .warning ("DID not found for container %s:%s" , scope , name )
158+
159+ return length , size
160+
134161 @classmethod
135- def cms_colocation (cls , hints ):
162+ def tape_metadata (cls , hints ):
136163 """
137164 https://github.com/dmwm/CMSRucio/issues/753
138165 https://github.com/dmwm/CMSRucio/issues/323
139-
140- Level 0
141- Data/MC/HIData/HiMC (from /store/(data/mc/hi/data/himc) plus RAW and HIRAW, and Parking.
142-
143- Level 1
144- Data tier - either in the LFN or the end of the parent container
145-
146- Level 2
147- Era (which for MC is the Campaign)
148-
149- Level 3
150- Parent Container (parent container of dataset if file)
151-
166+ https://its.cern.ch/jira/browse/CMSDM-315
152167
153- Examples:
154- * Parking data:
155- /store/data/Run2024C/ParkingVBF5/RAW/v1/000/380/115/00000/b4c0513e-f732-42b1-858d-572c86ce4b97.root
156- --> {'0': 'parking', '1': 'RAW', '2': 'Run2024C', '3': '/ParkingVBF5/Run2024C-v1/RAW'}
157- * Raw:
158- /store/hidata/HIRun2024B/HIPhysicsRawPrime5/RAW/v1/000/388/624/00000/fa0795b5-633b-461c-bc21-02d40a118dd2.root
159- --> {'0': 'hiraw', '1': 'RAW', '2': 'HIRun2024B', '3': '/HIPhysicsRawPrime5/HIRun2024B-v1/RAW'}
160-
168+ Tape Colocation:
169+ Level 0
170+ Data/MC/HIData/HiMC (from /store/(data/mc/hi/data/himc) plus RAW and HIRAW, and Parking.
171+
172+ Level 1
173+ Data tier - either in the LFN or the end of the parent container
174+
175+ Level 2
176+ Era (which for MC is the Campaign)
177+
178+ Level 3
179+ Parent Container (parent container of dataset if file)
180+
181+
182+ Examples:
183+ * Parking data:
184+ /store/data/Run2024C/ParkingVBF5/RAW/v1/000/380/115/00000/b4c0513e-f732-42b1-858d-572c86ce4b97.root
185+ --> {'0': 'parking', '1': 'RAW', '2': 'Run2024C', '3': '/ParkingVBF5/Run2024C-v1/RAW'}
186+ * Raw:
187+ /store/hidata/HIRun2024B/HIPhysicsRawPrime5/RAW/v1/000/388/624/00000/fa0795b5-633b-461c-bc21-02d40a118dd2.root
188+ --> {'0': 'hiraw', '1': 'RAW', '2': 'HIRun2024B', '3': '/HIPhysicsRawPrime5/HIRun2024B-v1/RAW'}
189+
190+ File Metadata:
191+ As given by Rucio hints:
192+ * Size
193+ * MD5
194+ * Adler32
195+
196+ Additional Hints:
197+ * Activity (default to "default" if not given)
198+ * Level 3 length and size (if parent container exists)
199+ - Length - total number of files in the parent container, including nested datasets/containers
200+ - Size - total size of all files in the parent container, including nested datasets/containers
201+
161202 """
162203
163204 lfn = hints ['name' ]
164205 data_type = cls .data_type (lfn )
165-
166206 colocation = {
167207 "0" : data_type ,
168208 }
169209
170210 if data_type != "n/a" :
171211 tier = cls .data_tier (lfn )
172212 era = cls .era (lfn )
173- parent = cls .parent_container (hints ['name' ])
174- if tier is not None :
213+ parent = cls .parent_container (hints ['scope' ], hints [ ' name' ])
214+ if tier is not None :
175215 colocation ['1' ] = tier
176- if era is not None :
216+ if era is not None :
177217 colocation ['2' ] = era
178- if parent is not None :
218+ if parent is not None :
179219 colocation ['3' ] = parent
180- else :
220+ else :
221+ parent = None
181222 logger .debug ("Could not determine data type for %s" , lfn )
182223
183- # TODO Speak with FTS3 Team about these headers
184224 logger .debug ("Setting colocation hints %s" , colocation )
185- return {"collocation_hints" : colocation }
225+
226+ additional_hints = {
227+ "activity" : hints .get ("activity" , "default" ),
228+ }
229+ if parent is not None :
230+ length , size = CMSTapeColocation ._get_container_stats (hints ['scope' ], parent )
231+ additional_hints ['3' ] = {
232+ "length" : length , #The number of files in the parent container
233+ "size" : size , #The total size of the parent container
234+ }
235+ logger .debug ("Setting additional hints %s" , additional_hints )
236+
237+ metadata = {
238+ "size" : hints ['metadata' ].get ('filesize' , 0 ), # File size
239+ "md5" : hints ['metadata' ].get ("md5" ), # MD5 checksum
240+ "adler32" : hints ['metadata' ].get ("adler32" ), # Adler32 checksum
241+ }
242+ logger .debug ("File metadata: %s" , metadata )
243+
244+ return {
245+ "collocation_hints" : colocation ,
246+ "additional_hints" : additional_hints ,
247+ "file_metadata" : metadata ,
248+ "schema_version" : cls .schema_version
249+ }
0 commit comments