Skip to content

Commit 73d06d8

Browse files
Merge pull request #959 from ericvaandering/revert_ext_md
Revert "CMSDM-315: Include additional metadata in the tape colocation plugin"
2 parents 3247450 + a16ffcd commit 73d06d8

File tree

1 file changed

+63
-125
lines changed

1 file changed

+63
-125
lines changed

src/policy/CMSRucioPolicy/algorithms/tape_colocation.py

Lines changed: 63 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -2,29 +2,24 @@
22
Tape colocation algorithm for placement of CMS data on tape
33
'''
44

5-
import json
6-
import logging
75
from typing import Any, Optional, Union
8-
96
from rucio.common.exception import DataIdentifierNotFound
10-
from rucio.common.types import InternalScope
11-
from rucio.core.did import get_did, list_content, list_parent_dids
12-
from rucio.db.sqla.constants import DIDType
137
from rucio.transfertool.fts3_plugins import FTS3TapeMetadataPlugin
8+
from rucio.core.did import list_parent_dids, get_did
9+
from rucio.db.sqla.constants import DIDType
10+
from rucio.common.types import InternalScope
11+
import logging
12+
import json
1413

1514
logger = logging.getLogger(__name__)
1615

17-
class CMSTapeColocation(FTS3TapeMetadataPlugin):
18-
policy_algorithm = "tape_metadata"
16+
class CMSTapeColocation(FTS3TapeMetadataPlugin):
17+
policy_algorithm = "tape_colocation"
1918

20-
# Logic for tape colocation
2119
allowed_types = ['data', 'hidata', 'mc', 'himc', 'relval', 'hirelval']
2220
parking_name = "parking"
2321
raw_name = "raw"
2422
hiraw_name = "hiraw"
25-
26-
# Schema version as of June 26, 2025
27-
schema_version = 1
2823

2924
def __init__(self, policy: Union[str, None] = None) -> None:
3025

@@ -37,8 +32,8 @@ def _module_init_(cls) -> None:
3732
logger.info("Registered plugin %s", cls.policy_algorithm)
3833
cls.register(
3934
cls.policy_algorithm,
40-
func=lambda x: cls.tape_metadata(x)
41-
)
35+
func=lambda x: cls.cms_colocation(x)
36+
)
4237

4338
@staticmethod
4439
def _encode(name: Any) -> Optional[str]:
@@ -49,33 +44,33 @@ def _encode(name: Any) -> Optional[str]:
4944
return None
5045

5146
@staticmethod
52-
def parent_container(scope, name):
47+
def parent_container(name):
5348
# Custom logic for CMS
5449
# If dataset - look for the parent container
5550
# If file - look for the parent dataset and then the parent container
56-
scope = InternalScope(scope)
57-
try:
51+
scope = InternalScope("cms")
52+
try:
5853
is_file = get_did(scope=scope, name=name)['type'] == DIDType.FILE
5954
except DataIdentifierNotFound:
6055
logger.warning("DID not found for %s:%s", scope, name)
6156
return None
62-
try:
57+
try:
6358
if is_file:
6459
parent_dataset = [parent
65-
for parent
66-
in list_parent_dids(scope=scope, name=name)
60+
for parent
61+
in list_parent_dids(scope=scope, name=name)
6762
][0]
6863
containers = [
6964
parent['name']
70-
for parent
71-
in list_parent_dids(scope=scope, name=parent_dataset['name'])
65+
for parent
66+
in list_parent_dids(scope=scope, name=parent_dataset['name'])
7267
if parent['type'] == DIDType.CONTAINER
7368
]
7469
else:
7570
containers = [
7671
parent['name']
77-
for parent
78-
in list_parent_dids(scope=scope, name=name)
72+
for parent
73+
in list_parent_dids(scope=scope, name=name)
7974
if parent['type']==DIDType.CONTAINER
8075
]
8176
container = CMSTapeColocation._encode(containers[0])
@@ -87,161 +82,104 @@ def parent_container(scope, name):
8782
logger.debug("No parent container found for %s:%s", scope, name)
8883

8984
@staticmethod
def _is_raw(name):
    """Return True when a '/'-separated component of *name* is exactly "RAW".

    The match is case-sensitive and applies to whole path components only,
    so components such as "RAW-RECO" or "raw" do not count.
    """
    # Raw data always carries a literal "RAW" component in its name.
    return "RAW" in name.split('/')
9388

9489
@staticmethod
def _is_parking(name):
    """Return True when a '/'-separated component of *name* starts with "Parking".

    Parking data is denoted by a "ParkingXXXX" component in the LFN.
    """
    # str.split always yields at least one element and str.startswith does
    # no indexing, so no IndexError can occur here and no handler is needed.
    return any(part.startswith("Parking") for part in name.split('/'))
10196

10297
@staticmethod
def data_type(name):
    """Classify an LFN into its level-0 colocation data type.

    The first path component after an optional leading '/store/' is the
    starting point. The result is:

    * "n/a" when that component is not in ``allowed_types``;
    * ``parking_name`` when the name is a parking LFN;
    * ``hiraw_name`` or ``raw_name`` when the name is a RAW LFN, depending
      on whether the first component starts with "hi";
    * the first component itself otherwise.
    """
    # First path component that is not `store`.
    top_level = name.removeprefix('/store/').split('/')[0]

    if top_level not in CMSTapeColocation.allowed_types:
        return "n/a"

    # Custom logic: parking takes precedence over raw, and both take
    # precedence over the plain top-level type.
    if CMSTapeColocation._is_parking(name):
        return CMSTapeColocation.parking_name
    if not CMSTapeColocation._is_raw(name):
        return top_level
    if top_level.startswith("hi"):
        return CMSTapeColocation.hiraw_name
    return CMSTapeColocation.raw_name
117113

118114
@staticmethod
def data_tier(name):
    """Return the data tier component of an LFN, or None if it is missing.

    The tier is the fourth '/'-separated component once a leading
    '/store/' has been stripped, e.g. "RAW" for
    "/store/data/Run2024C/ParkingVBF5/RAW/v1/...".
    A tier that ``_encode`` rejects is logged at debug level but still
    returned unchanged.
    """
    components = name.removeprefix('/store/').split('/')
    try:
        tier = components[3]
        if CMSTapeColocation._encode(tier) is None:
            logger.debug("Could not encode data tier for %s", name)
    except IndexError:
        logger.debug("Could not determine data tier for %s", name)
        return None
    return tier
127123

128124
@staticmethod
def era(name):
    """Return the era component of an LFN, or None if it is missing.

    The era is the second '/'-separated component once a leading
    '/store/' has been stripped, e.g. "Run2024C" for
    "/store/data/Run2024C/ParkingVBF5/RAW/v1/...".
    An era that ``_encode`` rejects is logged at debug level but still
    returned unchanged.
    """
    components = name.removeprefix('/store/').split('/')
    try:
        era_name = components[1]
        if CMSTapeColocation._encode(era_name) is None:
            logger.debug("Could not encode era for %s", name)
    except IndexError:
        logger.debug("Could not determine era for %s", name)
        return None
    return era_name
137133

138-
@staticmethod
139-
def _get_container_stats(scope, name):
140-
size = 0
141-
length = 0
142-
scope = InternalScope(scope)
143-
try:
144-
contents = list_content(scope, name)
145-
for item in contents:
146-
if item['type'] == DIDType.FILE:
147-
size += item.get('bytes', 0)
148-
length += 1
149-
elif item['type'] in [DIDType.DATASET, DIDType.CONTAINER]:
150-
# Recursively get size of nested datasets/containers
151-
sub_length, sub_size = CMSTapeColocation._get_container_stats(item['name'])
152-
length += sub_length
153-
size += sub_size
154-
except DataIdentifierNotFound:
155-
logger.warning("DID not found for container %s:%s", scope, name)
156-
157-
return length, size
158-
159134
@classmethod
def cms_colocation(cls, hints):
    """
    Build the tape colocation hints for the file described by *hints*.

    https://github.com/dmwm/CMSRucio/issues/753
    https://github.com/dmwm/CMSRucio/issues/323

    Level 0
    Data/MC/HIData/HIMC (from /store/(data|mc|hidata|himc)), plus RAW,
    HIRAW and Parking.

    Level 1
    Data tier, taken from the LFN.

    Level 2
    Era (which for MC is the Campaign).

    Level 3
    Parent container (parent container of the dataset if a file).

    Levels 1-3 are only looked up when level 0 is not "n/a", and each is
    only included when its lookup returns something other than None.

    Examples:
    * Parking data:
    /store/data/Run2024C/ParkingVBF5/RAW/v1/000/380/115/00000/b4c0513e-f732-42b1-858d-572c86ce4b97.root
    --> {'0': 'parking', '1': 'RAW', '2': 'Run2024C', '3': '/ParkingVBF5/Run2024C-v1/RAW'}
    * Raw:
    /store/hidata/HIRun2024B/HIPhysicsRawPrime5/RAW/v1/000/388/624/00000/fa0795b5-633b-461c-bc21-02d40a118dd2.root
    --> {'0': 'hiraw', '1': 'RAW', '2': 'HIRun2024B', '3': '/HIPhysicsRawPrime5/HIRun2024B-v1/RAW'}

    :param hints: mapping that provides the file's LFN under the 'name' key.
    :returns: a dict with the single key "collocation_hints" mapping to the
              level -> value dictionary described above.
    """
    lfn = hints['name']
    level0 = cls.data_type(lfn)
    colocation = {"0": level0}

    if level0 == "n/a":
        logger.debug("Could not determine data type for %s", lfn)
    else:
        # All three lookups run before any level is stored.
        levels = (
            ('1', cls.data_tier(lfn)),
            ('2', cls.era(lfn)),
            ('3', cls.parent_container(hints['name'])),
        )
        for level, value in levels:
            if value is not None:
                colocation[level] = value

    # TODO Speak with FTS3 Team about these headers
    logger.debug("Setting colocation hints %s", colocation)
    return {"collocation_hints": colocation}

0 commit comments

Comments
 (0)