Skip to content

Commit 7d9322b

Browse files
authored
Merge pull request #979 from juanpablosalas/fi_server
Moving FI server scripts to CMSRucio project
2 parents 3954a13 + 0a2321e commit 7d9322b

File tree

21 files changed

+996
-0
lines changed

21 files changed

+996
-0
lines changed
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
apiVersion: batch/v1
2+
kind: CronJob
3+
metadata:
4+
name: jobs-log-processor
5+
spec:
6+
schedule: "*/5 * * * *" # every 5 minutes
7+
jobTemplate:
8+
spec:
9+
template:
10+
spec:
11+
containers:
12+
- name: log-processor
13+
image: registry.paas.cern.ch/file-invalidation-tool/file-invalidation-tool:latest
14+
command: ["python3", "fi_manager/process_jobs.py"]
15+
restartPolicy: Never
16+
serviceAccountName: job-reader
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
apiVersion: batch/v1
2+
kind: Job
3+
metadata:
4+
name: file-invalidation-job
5+
spec:
6+
template:
7+
spec:
8+
initContainers:
9+
- name: debug-voms
10+
image: registry.cern.ch/cmsrucio/file_invalidation_tool:latest
11+
command: ["/bin/sh", "-c", "
12+
cp /tmp/usercert.pem /certs/usercert.pem;
13+
cp /tmp/userkey.pem /certs/userkey.pem;
14+
ls -l /certs/usercert.pem;
15+
stat -c '%a' /certs/usercert.pem;
16+
ls -l /certs/userkey.pem;
17+
stat -c '%a' /certs/userkey.pem;
18+
chmod 400 /certs/usercert.pem;
19+
chmod 600 /certs/userkey.pem;
20+
voms-proxy-init -voms cms -rfc -valid 192:00 --cert '/certs/usercert.pem' --key '/certs/userkey.pem';"]
21+
volumeMounts:
22+
- name: user-cert
23+
mountPath: "/tmp/usercert.pem"
24+
subPath: "usercert.pem"
25+
- name: user-key
26+
mountPath: "/tmp/userkey.pem"
27+
subPath: "userkey.pem"
28+
- name: vomses-volume
29+
mountPath: "/etc/vomses/"
30+
readOnly: true
31+
- name: certs
32+
mountPath: /certs
33+
- name: debug-cvmfs
34+
image: registry.cern.ch/cmsrucio/file_invalidation_tool:latest
35+
command:
36+
- /bin/sh
37+
- -c
38+
- |
39+
set -e;
40+
echo 'Waiting for 30s';
41+
sleep 30;
42+
# Check /cvmfs/cms.cern.ch/
43+
ls -l /cvmfs/cms.cern.ch/ | grep 'rucio';
44+
# Check /cvmfs/cms.cern.ch/rucio/
45+
ls -l /cvmfs/cms.cern.ch/rucio/ | grep 'rucio.cfg';
46+
47+
[ -d /input/ ] && rm -f /input/rucio* /input/dbs_files_inv.txt /input/datasets_inv.txt
48+
volumeMounts:
49+
- name: cvmfs-volume
50+
mountPath: "/cvmfs/"
51+
readOnly: true
52+
- name: input-file
53+
mountPath: "/input/"
54+
containers:
55+
- name: invalidation-tool
56+
image: registry.cern.ch/cmsrucio/file_invalidation_tool:latest
57+
imagePullPolicy: Always
58+
args: ["global","--reason", "$(REASON)", "--rucio-mode", "--dry-run"]
59+
#command: ["sleep","infinity"]
60+
volumeMounts:
61+
- name: input-file
62+
mountPath: "/input/"
63+
- name: dmtops-keytab
64+
mountPath: "/secrets/dmtops.keytab"
65+
subPath: "dmtops.keytab"
66+
- name: cvmfs-volume
67+
mountPath: "/cvmfs/"
68+
readOnly: true
69+
- name: vomses-volume
70+
mountPath: "/etc/vomses/"
71+
readOnly: true
72+
- name: certs
73+
mountPath: "/certs"
74+
env: #Pass environment variables to the container.
75+
- name: REASON
76+
value: "your_reason_here" #replace with your reason.
77+
restartPolicy: Never
78+
serviceAccountName: job-runner
79+
volumes:
80+
- name: input-file
81+
persistentVolumeClaim:
82+
claimName: input-file-pvc
83+
- name: user-cert
84+
configMap:
85+
name: user-certificates
86+
items:
87+
- key: usercert.pem
88+
path: usercert.pem
89+
- name: user-key
90+
configMap:
91+
name: user-certificates
92+
items:
93+
- key: userkey.pem
94+
path: userkey.pem
95+
- name: dmtops-keytab
96+
secret:
97+
secretName: dmtops-keytab
98+
items:
99+
- key: dmtops.keytab
100+
path: dmtops.keytab
101+
- name: vomses-volume
102+
configMap:
103+
name: vomses-config
104+
- name: cvmfs-volume
105+
persistentVolumeClaim:
106+
claimName: cvmfs-volume
107+
- name: certs
108+
emptyDir: {}
109+
backoffLimit: 4
110+
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
import pymysql
2+
3+
pymysql.install_as_MySQLdb()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from django.contrib import admin
2+
3+
# Register your models here.
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from django.apps import AppConfig
2+
3+
4+
class FiManagerConfig(AppConfig):
    """Django application configuration for the ``fi_manager`` app."""

    # Dotted name of the application this configuration applies to.
    name = "fi_manager"
    # Field class used for implicitly created primary keys.
    default_auto_field = "django.db.models.BigAutoField"
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# This is an auto-generated Django model module.
2+
# You'll have to do the following manually to clean this up:
3+
# * Rearrange models' order
4+
# * Make sure each model has one field with primary_key=True
5+
# * Make sure each ForeignKey and OneToOneField has `on_delete` set to the desired behavior
6+
# * Remove `managed = False` lines if you wish to allow Django to create, modify, and delete the table
7+
# Feel free to rename the models, but don't rename db_table values or field names.
8+
from django.db import models
9+
import uuid
10+
11+
class FileInvalidationRequests(models.Model):
    """One file of a file-invalidation request.

    Maps onto the existing ``file_invalidation_requests`` table. The model is
    declared with ``managed = False``, so Django does not create, modify or
    delete the table itself.
    """

    id = models.AutoField(primary_key=True)
    # Generated once per row by default and not editable through forms.
    request_id = models.UUIDField(default=uuid.uuid4, editable=False, unique=True)
    file_name = models.CharField(max_length=255)
    status = models.CharField(max_length=20)
    mode = models.CharField(max_length=10)
    dry_run = models.BooleanField()
    reason = models.TextField(blank=True, null=True)
    job_id = models.CharField(max_length=8, null=True, blank=True)
    logs = models.TextField()
    global_invalidate_last_replicas = models.BooleanField(default=False)

    class Meta:
        managed = False
        db_table = 'file_invalidation_requests'
        unique_together = (('request_id', 'file_name'),)

    def __str__(self):
        """Return a human-readable label with the row id, request id and file."""
        return (
            f"ID#: {self.id} REQUEST NUMBER {self.request_id} "
            f"FOR FILE {self.file_name}"
        )
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import logging
import os
import re
import sys

import django

# Put the parent of this file's directory on the import path so that the
# settings module named below and the ``fi_manager`` package can be imported
# when this file is executed directly as a script.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(_PROJECT_ROOT)

# setdefault() keeps any DJANGO_SETTINGS_MODULE already set in the environment.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'file_invalidation_server.settings')
django.setup()

# Model modules can only be imported once django.setup() has loaded the app
# registry, hence these imports are placed below the setup call on purpose.
from kubernetes import client, config  # noqa: E402
from fi_manager.models import FileInvalidationRequests  # noqa: E402

logging.basicConfig(level=logging.INFO,format='(%(asctime)s) [%(name)s] %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)
15+
16+
def fetch_and_process():
    """Scan the namespace's Kubernetes jobs and record their outcome in the database.

    For every job in the ``file-invalidation-tool`` namespace that reports a
    ``Complete`` condition, the log of its most recent pod is read and parsed
    with ``parse_job_logs``. If at least one DID was invalidated the matching
    request rows are updated through ``update_database``; otherwise, or if
    anything goes wrong while parsing/updating, the rows are marked as failed
    through ``update_database_for_failed_job``.

    Jobs without conditions, the ``jobs-log-processor`` jobs themselves and
    jobs reporting ``Failed`` are skipped (the latter with a warning only).
    Processed jobs are currently NOT deleted.
    """
    config.load_incluster_config()
    batch_v1 = client.BatchV1Api()
    core_v1 = client.CoreV1Api()

    namespace = 'file-invalidation-tool'
    jobs = batch_v1.list_namespaced_job(namespace=namespace)

    for job in jobs.items:
        job_name = job.metadata.name
        if not job.status.conditions:
            # Job has not reached a terminal condition yet.
            continue

        if job.kind == "CronJob" or 'jobs-log-processor' in job_name:
            # Never process the runs of this log processor itself.
            continue

        condition_types = {cond.type: cond.status for cond in job.status.conditions}
        if condition_types.get("Failed") == "True":
            logger.warning(f"Job {job_name} failed")
            continue
        if condition_types.get("Complete") != "True":
            continue

        # Get pods created by this job
        pods = core_v1.list_namespaced_pod(
            namespace=namespace,
            label_selector=f"job-name={job_name}"
        )
        if not pods.items:
            logger.error(f"There are no pods under the {job_name} job name.")
            continue

        # Use most recent pod
        latest_pod = max(
            pods.items,
            key=lambda pod: pod.status.start_time or pod.metadata.creation_timestamp,
        )

        pod_name = latest_pod.metadata.name
        logger.info(f"Pod name: {pod_name}")
        logs = core_v1.read_namespaced_pod_log(pod_name, namespace=namespace)
        try:
            rucio_invalidated_dids, dbs_invalidated_dids, dry_run = parse_job_logs(logs)
            logger.info(logs)
            logger.info(f"Job {job_name} has invalidated {len(rucio_invalidated_dids)} dids on Rucio and {len(dbs_invalidated_dids)} dids on DBS.")
            logger.info(f"Job {job_name} has invalidated the following DIDs on Rucio: {rucio_invalidated_dids}")
            logger.info(f"Job {job_name} has invalidated the following DIDs on DBS: {dbs_invalidated_dids}")
            if not rucio_invalidated_dids and not dbs_invalidated_dids:
                # A completed job that invalidated nothing is reported as a failure.
                raise RuntimeError(f"Job {job_name} did not invalidate any DIDs on Rucio or DBS.")
            update_database(job_name, rucio_invalidated_dids, dbs_invalidated_dids, dry_run)
            logger.info(f"Job {job_name} has completed and the DIDs have updated.")
        except Exception as e:
            logger.error(f"Job {pod_name} has failed with error: {e}")
            try:
                update_database_for_failed_job(pod_name,f'Job {pod_name} has failed with error: {str(e)}\n{logs}')
            except Exception:
                # Recording the failure must not abort the processing of the
                # remaining jobs (e.g. when the pod name has an unexpected form).
                logger.exception(f"Could not record the failure of {pod_name} in the database.")

        # Job deletion is disabled for now. To enable it:
        # batch_v1.delete_namespaced_job(
        #     name=job_name,
        #     namespace=namespace,
        #     body=client.V1DeleteOptions(propagation_policy='Foreground'))
        # NOTE(review): the original nesting of this message was lost; it is
        # emitted here once per processed (Complete) job -- confirm.
        logger.info(f"Job {job_name} would be deleted but is being kept for dev purposes.")
81+
82+
def parse_job_logs(logs: str):
    """Extract the invalidated DIDs reported in a file-invalidation job log.

    A log containing ``Would declare file`` is treated as a dry run and the
    "Would ..." messages are collected; otherwise the messages reporting
    performed invalidations are collected. Datasets reported with
    ``Invalidation OK for dataset:`` (non-dry-run only) are added to both the
    Rucio and the DBS result lists.

    Args:
        logs: Full text of the job's pod log.

    Returns:
        A tuple ``(rucio_invalidated_files, dbs_invalidated_files, dry_run)``:
        two lists of matched paths (possibly empty, never ``None``) and a bool
        telling whether the log belongs to a dry run.

    Raises:
        Exception: If the log contains ``Error running shell script``.
    """
    if "Error running shell script" in logs:
        raise Exception("Job has failed with error: Error running shell script")

    dry_run = 'Would declare file' in logs

    # A path made of word characters, '/' and '-' ending in a literal ".root".
    root_path = r'(/[\w/\-]+\.root)'

    if dry_run:
        rucio_pattern = r'Would declare file ' + root_path + r' as bad at'
        dbs_pattern = r'Would invalidate file on DBS: ' + root_path + r'\s'
        dataset_pattern = None
    else:
        rucio_pattern = r'Declared file ' + root_path + r' as bad at'
        dbs_pattern = r'Invalidation OK for file: ' + root_path + r'\s'
        # NOTE(review): this keeps the original requirement that the dataset
        # name ends in ".root" -- confirm that dataset names really look so.
        dataset_pattern = r'Invalidation OK for dataset: ' + root_path + r'\s'

    rucio_invalidated_files = re.findall(rucio_pattern, logs)
    dbs_invalidated_files = re.findall(dbs_pattern, logs)
    dbs_invalidated_dataset = re.findall(dataset_pattern, logs) if dataset_pattern else []

    if dbs_invalidated_dataset:
        # Assumes that for datasets, DBS dataset invalidation implies Rucio file invalidation.
        # extend() mutates in place; assigning the result of append() would yield None.
        dbs_invalidated_files.extend(dbs_invalidated_dataset)
        rucio_invalidated_files.extend(dbs_invalidated_dataset)

    return rucio_invalidated_files, dbs_invalidated_files, dry_run
109+
110+
def update_database(job_name, rucio_list, dbs_list, dry_run):
    """Mark the request rows of a finished job as successfully invalidated.

    The 8-character job id is taken from ``job_name`` and used to select the
    rows of ``FileInvalidationRequests``. Each file is written exactly once
    with ``status='success'`` and a ``mode`` of ``'global'`` (reported by both
    Rucio and DBS), ``'rucio_only'`` or ``'dbs_only'``.

    Args:
        job_name: Kubernetes job name containing ``file-invalidation-job-<id>``.
        rucio_list: DIDs reported as invalidated on Rucio.
        dbs_list: DIDs reported as invalidated on DBS.
        dry_run: Whether the job ran in dry-run mode; stored on every row.

    Raises:
        ValueError: If no job id can be extracted from ``job_name``.
    """
    match = re.search(r'file-invalidation-job-(\w{8})', job_name)
    if match is None:
        raise ValueError(f"Cannot extract a job id from job name {job_name!r}")
    job_id = match.group(1)

    rucio_dids = set(rucio_list)
    dbs_dids = set(dbs_list)
    globally_invalidated_dids = rucio_dids & dbs_dids

    # Disjoint groups: the final state equals the original three overlapping
    # updates (where 'global' was applied last) without rewriting rows.
    dids_by_mode = (
        ('rucio_only', rucio_dids - globally_invalidated_dids),
        ('dbs_only', dbs_dids - globally_invalidated_dids),
        ('global', globally_invalidated_dids),
    )
    for mode, dids in dids_by_mode:
        if not dids:
            continue
        FileInvalidationRequests.objects.filter(
            job_id=job_id, file_name__in=dids
        ).update(status='success', mode=mode, dry_run=dry_run)
124+
125+
def update_database_for_failed_job(job_name,logs):
    """Mark every request row of a job as failed and store its logs.

    Args:
        job_name: Name containing ``file-invalidation-job-<id>``; either the
            bare job name or a pod name with a ``-<suffix>`` after the id.
        logs: Text stored in the ``logs`` column of the affected rows.

    Raises:
        ValueError: If no job id can be extracted from ``job_name``.
    """
    # The id must be followed by a "-<suffix>" (pod name) or end the string
    # (job name); names accepted by the previous pattern give the same id.
    match = re.search(r'file-invalidation-job-(\w{8})(?:-\w|$)', job_name)
    if match is None:
        raise ValueError(f"Cannot extract a job id from name {job_name!r}")
    job_id = match.group(1)

    failed_invalidation = FileInvalidationRequests.objects.filter(job_id=job_id)
    failed_invalidation.update(status='failed',logs=logs)
129+
130+
if __name__ == "__main__":
131+
fetch_and_process()
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
/* static/css/styles.css */
2+
body {
3+
font-family: sans-serif;
4+
margin: 0;
5+
padding: 0;
6+
background-color: #f4f4f4;
7+
}
8+
9+
header, footer {
10+
background-color: #e0e0e0;
11+
padding: 20px;
12+
text-align: center;
13+
}
14+
15+
main {
16+
padding: 20px;
17+
}

0 commit comments

Comments
 (0)