22Tape colocation algorithm for placement of CMS data on tape
33'''
44
5- import json
6- import logging
75from typing import Any , Optional , Union
8-
96from rucio .common .exception import DataIdentifierNotFound
10- from rucio .common .types import InternalScope
11- from rucio .core .did import get_did , list_content , list_parent_dids
12- from rucio .db .sqla .constants import DIDType
137from rucio .transfertool .fts3_plugins import FTS3TapeMetadataPlugin
8+ from rucio .core .did import list_parent_dids , get_did
9+ from rucio .db .sqla .constants import DIDType
10+ from rucio .common .types import InternalScope
11+ import logging
12+ import json
1413
1514logger = logging .getLogger (__name__ )
1615
17- class CMSTapeColocation (FTS3TapeMetadataPlugin ):
18- policy_algorithm = "tape_metadata "
16+ class CMSTapeColocation (FTS3TapeMetadataPlugin ):
17+ policy_algorithm = "tape_colocation "
1918
20- # Logic for tape colocation
2119 allowed_types = ['data' , 'hidata' , 'mc' , 'himc' , 'relval' , 'hirelval' ]
2220 parking_name = "parking"
2321 raw_name = "raw"
2422 hiraw_name = "hiraw"
25-
26- # Schema version as of June 26, 2025
27- schema_version = 1
2823
2924 def __init__ (self , policy : Union [str , None ] = None ) -> None :
3025
@@ -37,8 +32,8 @@ def _module_init_(cls) -> None:
3732 logger .info ("Registered plugin %s" , cls .policy_algorithm )
3833 cls .register (
3934 cls .policy_algorithm ,
40- func = lambda x : cls .tape_metadata (x )
41- )
35+ func = lambda x : cls .cms_colocation (x )
36+ )
4237
4338 @staticmethod
4439 def _encode (name : Any ) -> Optional [str ]:
@@ -49,33 +44,33 @@ def _encode(name: Any) -> Optional[str]:
4944 return None
5045
5146 @staticmethod
52- def parent_container (scope , name ):
47+ def parent_container (name ):
5348 # Custom logic for CMS
5449 # If dataset - look for the parent container
5550 # If file - look for the parent dataset and then the parent container
56- scope = InternalScope (scope )
57- try :
51+ scope = InternalScope ("cms" )
52+ try :
5853 is_file = get_did (scope = scope , name = name )['type' ] == DIDType .FILE
5954 except DataIdentifierNotFound :
6055 logger .warning ("DID not found for %s:%s" , scope , name )
6156 return None
62- try :
57+ try :
6358 if is_file :
6459 parent_dataset = [parent
65- for parent
66- in list_parent_dids (scope = scope , name = name )
60+ for parent
61+ in list_parent_dids (scope = scope , name = name )
6762 ][0 ]
6863 containers = [
6964 parent ['name' ]
70- for parent
71- in list_parent_dids (scope = scope , name = parent_dataset ['name' ])
65+ for parent
66+ in list_parent_dids (scope = scope , name = parent_dataset ['name' ])
7267 if parent ['type' ] == DIDType .CONTAINER
7368 ]
7469 else :
7570 containers = [
7671 parent ['name' ]
77- for parent
78- in list_parent_dids (scope = scope , name = name )
72+ for parent
73+ in list_parent_dids (scope = scope , name = name )
7974 if parent ['type' ]== DIDType .CONTAINER
8075 ]
8176 container = CMSTapeColocation ._encode (containers [0 ])
@@ -87,161 +82,104 @@ def parent_container(scope, name):
8782 logger .debug ("No parent container found for %s:%s" , scope , name )
8883
8984 @staticmethod
90- def _is_raw (name ):
85+ def _is_raw (name ):
9186 # Raw always contains "RAW" in the name
9287 return any (i == "RAW" for i in name .split ('/' ))
9388
9489 @staticmethod
9590 def _is_parking (name ):
9691 # Parking is denoted by a path segment starting with "Parking" (e.g. ParkingVBF5) in the LFN
97- try :
92+ try :
9893 return any (n .startswith ("Parking" ) for n in name .split ('/' ))
9994 except IndexError :
10095 return False
10196
10297 @staticmethod
103- def data_type (name ):
98+ def data_type (name ):
10499 data_type = name .removeprefix ('/store/' ).split ('/' )[0 ] # First index that isn't `store`
100+
105101 # Custom logic: Use parking or raw over "data", use hiraw if heavy ion and raw
106- if data_type not in CMSTapeColocation .allowed_types :
102+ if data_type not in CMSTapeColocation .allowed_types :
107103 data_type = "n/a"
108- elif CMSTapeColocation ._is_parking (name ):
104+ elif CMSTapeColocation ._is_parking (name ):
109105 data_type = CMSTapeColocation .parking_name
110106 elif CMSTapeColocation ._is_raw (name ):
111- if data_type .startswith ("hi" ):
107+ if data_type .startswith ("hi" ):
112108 data_type = CMSTapeColocation .hiraw_name
113- else :
109+ else :
114110 data_type = CMSTapeColocation .raw_name
115111
116112 return data_type
117113
118114 @staticmethod
119- def data_tier (name ):
120- try :
115+ def data_tier (name ):
116+ try :
121117 tier = name .removeprefix ('/store/' ).split ('/' )[3 ]
122118 if CMSTapeColocation ._encode (tier ) is None :
123119 logger .debug ("Could not encode data tier for %s" , name )
124120 return tier
125- except IndexError :
121+ except IndexError :
126122 logger .debug ("Could not determine data tier for %s" , name )
127123
128124 @staticmethod
129- def era (name ):
130- try :
125+ def era (name ):
126+ try :
131127 era = name .removeprefix ('/store/' ).split ('/' )[1 ]
132128 if CMSTapeColocation ._encode (era ) is None :
133129 logger .debug ("Could not encode era for %s" , name )
134130 return era
135- except IndexError :
131+ except IndexError :
136132 logger .debug ("Could not determine era for %s" , name )
137133
138- @staticmethod
139- def _get_container_stats (scope , name ):
140- size = 0
141- length = 0
142- scope = InternalScope (scope )
143- try :
144- contents = list_content (scope , name )
145- for item in contents :
146- if item ['type' ] == DIDType .FILE :
147- size += item .get ('bytes' , 0 )
148- length += 1
149- elif item ['type' ] in [DIDType .DATASET , DIDType .CONTAINER ]:
150- # Recursively get size of nested datasets/containers
151- sub_length , sub_size = CMSTapeColocation ._get_container_stats (item ['name' ])
152- length += sub_length
153- size += sub_size
154- except DataIdentifierNotFound :
155- logger .warning ("DID not found for container %s:%s" , scope , name )
156-
157- return length , size
158-
159134 @classmethod
160- def tape_metadata (cls , hints ):
135+ def cms_colocation (cls , hints ):
161136 """
162137 https://github.com/dmwm/CMSRucio/issues/753
163138 https://github.com/dmwm/CMSRucio/issues/323
164- https://its.cern.ch/jira/browse/CMSDM-315
139+
140+ Level 0
141+ Data/MC/HIData/HiMC (from /store/(data/mc/hidata/himc)) plus RAW and HIRAW, and Parking.
142+
143+ Level 1
144+ Data tier - either in the LFN or the end of the parent container
145+
146+ Level 2
147+ Era (which for MC is the Campaign)
148+
149+ Level 3
150+ Parent Container (parent container of dataset if file)
151+
165152
166- Tape Colocation:
167- Level 0
168- Data/MC/HIData/HiMC (from /store/(data/mc/hi/data/himc) plus RAW and HIRAW, and Parking.
169-
170- Level 1
171- Data tier - either in the LFN or the end of the parent container
172-
173- Level 2
174- Era (which for MC is the Campaign)
175-
176- Level 3
177- Parent Container (parent container of dataset if file)
178-
179-
180- Examples:
181- * Parking data:
182- /store/data/Run2024C/ParkingVBF5/RAW/v1/000/380/115/00000/b4c0513e-f732-42b1-858d-572c86ce4b97.root
183- --> {'0': 'parking', '1': 'RAW', '2': 'Run2024C', '3': '/ParkingVBF5/Run2024C-v1/RAW'}
184- * Raw:
185- /store/hidata/HIRun2024B/HIPhysicsRawPrime5/RAW/v1/000/388/624/00000/fa0795b5-633b-461c-bc21-02d40a118dd2.root
186- --> {'0': 'hiraw', '1': 'RAW', '2': 'HIRun2024B', '3': '/HIPhysicsRawPrime5/HIRun2024B-v1/RAW'}
187-
188- File Metadata:
189- As given by Rucio hints:
190- * Size
191- * MD5
192- * Adler32
193-
194- Additional Hints:
195- * Activity (default to "default" if not given)
196- * Level 3 length and size (if parent container exists)
197- - Length - total number of files in the parent container, including nested datasets/containers
198- - Size - total size of all files in the parent container, including nested datasets/containers
199-
153+ Examples:
154+ * Parking data:
155+ /store/data/Run2024C/ParkingVBF5/RAW/v1/000/380/115/00000/b4c0513e-f732-42b1-858d-572c86ce4b97.root
156+ --> {'0': 'parking', '1': 'RAW', '2': 'Run2024C', '3': '/ParkingVBF5/Run2024C-v1/RAW'}
157+ * Raw:
158+ /store/hidata/HIRun2024B/HIPhysicsRawPrime5/RAW/v1/000/388/624/00000/fa0795b5-633b-461c-bc21-02d40a118dd2.root
159+ --> {'0': 'hiraw', '1': 'RAW', '2': 'HIRun2024B', '3': '/HIPhysicsRawPrime5/HIRun2024B-v1/RAW'}
160+
200161 """
201162
202163 lfn = hints ['name' ]
203164 data_type = cls .data_type (lfn )
165+
204166 colocation = {
205167 "0" : data_type ,
206168 }
207169
208170 if data_type != "n/a" :
209171 tier = cls .data_tier (lfn )
210172 era = cls .era (lfn )
211- parent = cls .parent_container (hints ['scope' ], hints [ ' name' ])
212- if tier is not None :
173+ parent = cls .parent_container (hints ['name' ])
174+ if tier is not None :
213175 colocation ['1' ] = tier
214- if era is not None :
176+ if era is not None :
215177 colocation ['2' ] = era
216- if parent is not None :
178+ if parent is not None :
217179 colocation ['3' ] = parent
218- else :
219- parent = None
180+ else :
220181 logger .debug ("Could not determine data type for %s" , lfn )
221182
183+ # TODO Speak with FTS3 Team about these headers
222184 logger .debug ("Setting colocation hints %s" , colocation )
223-
224- additional_hints = {
225- "activity" : hints .get ("activity" , "default" ),
226- }
227- if parent is not None :
228- length , size = CMSTapeColocation ._get_container_stats (hints ['scope' ], parent )
229- additional_hints ['3' ] = {
230- "length" : length , #The number of files in the parent container
231- "size" : size , #The total size of the parent container
232- }
233- logger .debug ("Setting additional hints %s" , additional_hints )
234-
235- metadata = {
236- "size" : hints ['metadata' ].get ('filesize' , 0 ), # File size
237- "md5" : hints ['metadata' ].get ("md5" ), # MD5 checksum
238- "adler32" : hints ['metadata' ].get ("adler32" ), # Adler32 checksum
239- }
240- logger .debug ("File metadata: %s" , metadata )
241-
242- return {
243- "collocation_hints" : colocation ,
244- "additional_hints" : additional_hints ,
245- "file_metadata" : metadata ,
246- "schema_version" : cls .schema_version
247- }
185+ return {"collocation_hints" : colocation }
0 commit comments