|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +"""Returns a list of files from a dataset including only files that are hosted on disk.""" |
| 4 | + |
| 5 | +import os,sys,getpass,warnings,glob,shlex,subprocess,argparse # pylint: disable=multiple-imports |
| 6 | +from collections import defaultdict |
| 7 | + |
def getOS():
    """Gets the OS version number by running sed on /etc/redhat-release.

    The value is obtained from the shell because other methods return the
    host OS when running inside a container.

    Returns:
        The leading number found in /etc/redhat-release as a string, with
        trailing whitespace removed.

    Raises:
        subprocess.CalledProcessError: if the sed command exits with a
            non-zero status.
    """
    # sed -n ... /p prints only the lines where the substitution matched,
    # reduced to their first run of digits
    sed_command = shlex.split(r"sed -nr 's/[^0-9]*([0-9]+).*/\1/p' /etc/redhat-release")
    sed_output = subprocess.check_output(sed_command, encoding="utf-8")
    return sed_output.rstrip()
| 13 | + |
def getHosted(dataset):
    """Gets list of files on disk for a dataset, and how many files each site has.

    Configures the rucio client from CVMFS (sets RUCIO_HOME and RUCIO_ACCOUNT
    in the environment and temporarily prepends the rucio site-packages
    directory to sys.path), then queries the replicas of every block of the
    dataset in the 'cms' scope.

    Args:
        dataset: name of the dataset to query.

    Returns:
        A tuple (filelist, sitelist). filelist is a set with the names of all
        files that have at least one AVAILABLE replica at a site whose name
        does not contain "_Tape". sitelist is a defaultdict(int) mapping each
        such site to the number of AVAILABLE replicas found there.

    Raises:
        FileNotFoundError: if no rucio python library directory exists under
            the expected CVMFS path.
    """
    osv = getOS()
    rucio_path = f'/cvmfs/cms.cern.ch/rucio/x86_64/rhel{osv}/py3/current'
    os.environ['RUCIO_HOME'] = rucio_path
    os.environ['RUCIO_ACCOUNT'] = getpass.getuser()

    lib_dirs = glob.glob(rucio_path+'/lib/python*.*')
    if not lib_dirs:
        # fail with an explicit message instead of a bare IndexError
        raise FileNotFoundError(f"no rucio python library found under {rucio_path}/lib")
    site_packages = lib_dirs[0]+'/site-packages/'

    def sitecond(site):
        """Accepts only sites that are not tape storage."""
        return "_Tape" not in site

    filelist = set()
    sitelist = defaultdict(int)

    sys.path.insert(0,site_packages)
    try:
        warnings.filterwarnings("ignore", message=".*cryptography.*")
        from rucio.client.client import Client # pylint: disable=import-error,import-outside-toplevel
        from rucio.client.replicaclient import ReplicaClient # pylint: disable=import-error,import-outside-toplevel
        client = Client()
        rep_client = ReplicaClient()

        # loop over blocks to avoid timeout error from too-large response
        all_blocks = list(client.list_content(scope='cms',name=dataset))
        # batch some blocks together for fewer requests
        # not fully optimized, but n=10 tested to be ~15% faster than n=1
        nblocks = 10
        block_groups = [all_blocks[i:i+nblocks] for i in range(0, len(all_blocks), nblocks)]

        for block_group in block_groups:
            dids = [{'scope': 'cms', 'name': block['name']} for block in block_group]
            # iterate the response directly: no need to build an intermediate list
            for rep in rep_client.list_replicas(dids):
                for site,state in rep['states'].items():
                    if state=='AVAILABLE' and sitecond(site):
                        filelist.add(rep['name'])
                        sitelist[site] += 1
    finally:
        # always undo the sys.path change, even if an import or a query failed;
        # remove the entry by value in case an import modified sys.path
        if site_packages in sys.path:
            sys.path.remove(site_packages)

    return filelist, sitelist
| 51 | + |
def main(dataset, outfile=None, verbose=False):
    """Prints file list and site list.

    Args:
        dataset: name of the dataset, passed to getHosted().
        outfile: path of a file to write the file list to; if None, the file
            list is printed to stdout.
        verbose: if True, first print each site and its file count to stdout.
    """
    filelist, sitelist = getHosted(dataset)

    if verbose:
        print("Site list:")
        print("\n".join(f'{k}: {v}' for k,v in sitelist.items()))

    # getHosted() returns the files as a set, whose iteration order varies
    # between runs: sort so that the output is reproducible
    listing = "\n".join(sorted(filelist))
    if outfile is None:
        print(listing)
    else:
        # context manager closes the file even if writing fails
        with open(outfile, 'w', encoding="utf-8") as file:
            print(listing, file=file)
| 63 | + |
if __name__ == "__main__":
    # command-line interface: a positional dataset name plus optional
    # output file and verbosity switches
    cli = argparse.ArgumentParser(
        description="Find all available files (those hosted on disk) for a given dataset",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument(
        "-o", "--outfile",
        type=str,
        default=None,
        help="write to this file instead of stdout",
    )
    cli.add_argument(
        "-v", "--verbose",
        action="store_true",
        default=False,
        help="print extra information (site list)",
    )
    cli.add_argument("dataset", type=str, help="dataset to query")
    options = cli.parse_args()

    main(options.dataset, outfile=options.outfile, verbose=options.verbose)
0 commit comments