Skip to content

Commit e75f7dd

Browse files
authored
Merge pull request DIRACGrid#7954 from aldbr/cherry-pick-2-d5aebd11d-integration
[sweep:integration] fix(wms): correctly log the pilot job reference during the matching process
2 parents 58472fd + 01b5238 commit e75f7dd

File tree

9 files changed

+271
-80
lines changed

9 files changed

+271
-80
lines changed

docs/source/DeveloperGuide/AddingNewComponents/Utilities/gLogger/gLogger/Advanced/index.rst

+62
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,68 @@ This option can not be modified in the children of *gLogger*, even by
217217
*gLogger* itself after the configuration, so the children receive
218218
the *gLogger* configuration.
219219

220+
Add variables to different *Logging* objects depending on the context
221+
---------------------------------------------------------------------
222+
223+
In complex cases, it can be useful to have loggers that change depending on
224+
the execution context, without having to pass logger instances explicitly
225+
through multiple layers of function calls.
226+
227+
Python's `contextvars` module provides context-local storage, which can be used
228+
to store and retrieve context-specific data, such as logger instances.
229+
230+
gLogger supports the use of context variables to manage loggers in a flexible way.
231+
232+
Provide a Context Logger
233+
~~~~~~~~~~~~~~~~~~~~~~~~
234+
235+
When you have a *Logging* instance that you want to use in a specific context,
236+
you can set it in the context variable:
237+
238+
::
239+
240+
# Create a logger instance
241+
logger = gLogger.getSubLogger("MyContextLogger")
242+
243+
# Set it in the context variable
244+
contextLogger.set(logger)
245+
246+
Then, the instances within the context block will use the shared *Logging* object
247+
set in the context variable:
248+
249+
::
250+
251+
with setContextLogger(contextualLogger):
252+
# Any logging within this block will use contextualLogger
253+
obj = MyClass()
254+
obj.do_something() # This will use contextualLogger
255+
256+
Consume a Context Logger
257+
~~~~~~~~~~~~~~~~~~~~~~~~
258+
259+
In functions or classes that need to log messages, you can retrieve the logger
260+
from the context variable:
261+
262+
::
263+
264+
class MyClass:
265+
def __init__(self):
266+
# Get the default logger if no context logger is set
267+
self._defaultLogger = gLogger.getSubLogger("MyClass")
268+
269+
@property
270+
def log(self):
271+
# Return the context logger if set, otherwise the default logger
272+
return contextLogger.get() or self._defaultLogger
273+
274+
@log.setter
275+
def log(self, value):
276+
# Optionally, allow setting a new default logger
277+
self._defaultLogger = value
278+
279+
def do_something(self):
280+
self.log.notice("Doing something")
281+
220282
Some examples and summaries
221283
---------------------------
222284

Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
1+
from DIRAC.FrameworkSystem.private.standardLogging.LoggingContext import contextLogger, setContextLogger
12
from DIRAC.FrameworkSystem.private.standardLogging.LoggingRoot import LoggingRoot
23

34
gLogger = LoggingRoot()
45

56

67
def getLogger():
78
return gLogger
9+
10+
11+
__all__ = ["contextLogger", "setContextLogger", "getLogger"]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
""" Logging context module"""
2+
3+
# Context variable for the logger (adapted to the request of the pilot reference)
4+
import contextvars
5+
from contextlib import contextmanager
6+
7+
contextLogger = contextvars.ContextVar("Logger", default=None)
8+
9+
10+
@contextmanager
11+
def setContextLogger(logger_name):
12+
token = contextLogger.set(logger_name)
13+
try:
14+
yield
15+
finally:
16+
contextLogger.reset(token)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
""" Test the context variable logger """
2+
3+
from DIRAC import gLogger
4+
from DIRAC.FrameworkSystem.private.standardLogging.Logging import Logging
5+
from DIRAC.FrameworkSystem.private.standardLogging.test.TestLogUtilities import gLoggerReset
6+
from DIRAC.FrameworkSystem.private.standardLogging.LoggingContext import contextLogger, setContextLogger
7+
8+
9+
class A:
10+
def __init__(self):
11+
# Get the logger from the context variable
12+
self._defaultLogger = gLogger.getSubLogger("A")
13+
14+
# Use a property to get and set the logger, this is necessary to use the context variable
15+
@property
16+
def log(self):
17+
return contextLogger.get() or self._defaultLogger
18+
19+
@log.setter
20+
def log(self, value: Logging):
21+
self._defaultLogger = value
22+
23+
def do_something(self):
24+
self.log.notice("A is doing something")
25+
26+
27+
class B:
28+
def __init__(self, a: A, pilotRef: str = None):
29+
self.a = A()
30+
31+
# Get the logger from the context variable
32+
if pilotRef:
33+
self.log = gLogger.getLocalSubLogger(f"[{pilotRef}]B")
34+
contextLogger.set(self.log)
35+
else:
36+
self.log = gLogger.getSubLogger("B")
37+
38+
def do_something_else(self):
39+
with setContextLogger(self.log):
40+
self.a.do_something()
41+
self.log.notice("B is doing something else")
42+
43+
44+
def test_contextvar_logger():
45+
capturedBackend, log, sublog = gLoggerReset()
46+
47+
# Create an instance of A
48+
a = A()
49+
50+
# Create an instance of B and call its method without setting the pilotRef
51+
# Log signature coming from A and B should be different
52+
b1 = B(a)
53+
b1.do_something_else()
54+
assert "Framework/B NOTICE: A is doing something" in capturedBackend.getvalue()
55+
assert "Framework/B NOTICE: B is doing something else" in capturedBackend.getvalue()
56+
57+
# Create an instance of B and call its method with setting the pilotRef
58+
# Log signature coming from A and B should be similar because of the pilotRef
59+
capturedBackend.truncate(0)
60+
61+
b2 = B(a, "pilotRef")
62+
b2.do_something_else()
63+
assert "Framework/[pilotRef]B NOTICE: A is doing something" in capturedBackend.getvalue()
64+
assert "Framework/[pilotRef]B NOTICE: B is doing something else" in capturedBackend.getvalue()
65+
66+
# Now we check that the logger of b1 is not the same as the logger of b2 (b1 should still use its own logger)
67+
capturedBackend.truncate(0)
68+
69+
b1.do_something_else()
70+
assert "Framework/B NOTICE: A is doing something" in capturedBackend.getvalue()
71+
assert "Framework/B NOTICE: B is doing something else" in capturedBackend.getvalue()

src/DIRAC/WorkloadManagementSystem/Client/Matcher.py

+77-80
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
1010
from DIRAC.Core.Security import Properties
1111
from DIRAC.Core.Utilities.PrettyPrint import printDict
12+
from DIRAC.FrameworkSystem.Client.Logger import setContextLogger
1213
from DIRAC.ResourceStatusSystem.Client.SiteStatus import SiteStatus
1314
from DIRAC.WorkloadManagementSystem.Client import JobStatus, PilotStatus
1415
from DIRAC.WorkloadManagementSystem.Client.Limiter import Limiter
@@ -50,11 +51,7 @@ def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelp
5051
self.opsHelper = Operations()
5152

5253
if pilotRef:
53-
self.log = gLogger.getSubLogger(f"[{pilotRef}]Matcher")
54-
self.pilotAgentsDB.log = gLogger.getSubLogger(f"[{pilotRef}]Matcher")
55-
self.jobDB.log = gLogger.getSubLogger(f"[{pilotRef}]Matcher")
56-
self.tqDB.log = gLogger.getSubLogger(f"[{pilotRef}]Matcher")
57-
self.jlDB.log = gLogger.getSubLogger(f"[{pilotRef}]Matcher")
54+
self.log = gLogger.getLocalSubLogger(f"[{pilotRef}]Matcher")
5855
else:
5956
self.log = gLogger.getSubLogger("Matcher")
6057

@@ -64,86 +61,86 @@ def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelp
6461

6562
def selectJob(self, resourceDescription, credDict):
6663
"""Main job selection function to find the highest priority job matching the resource capacity"""
64+
with setContextLogger(self.log):
65+
startTime = time.time()
66+
67+
resourceDict = self._getResourceDict(resourceDescription, credDict)
68+
69+
# Make a nice print of the resource matching parameters
70+
toPrintDict = dict(resourceDict)
71+
if "MaxRAM" in resourceDescription:
72+
toPrintDict["MaxRAM"] = resourceDescription["MaxRAM"]
73+
if "NumberOfProcessors" in resourceDescription:
74+
toPrintDict["NumberOfProcessors"] = resourceDescription["NumberOfProcessors"]
75+
toPrintDict["Tag"] = []
76+
if "Tag" in resourceDict:
77+
for tag in resourceDict["Tag"]:
78+
if not tag.endswith("GB") and not tag.endswith("Processors"):
79+
toPrintDict["Tag"].append(tag)
80+
if not toPrintDict["Tag"]:
81+
toPrintDict.pop("Tag")
82+
self.log.info("Resource description for matching", printDict(toPrintDict))
83+
84+
negativeCond = self.limiter.getNegativeCondForSite(resourceDict["Site"], resourceDict.get("GridCE"))
85+
result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)
6786

68-
startTime = time.time()
69-
70-
resourceDict = self._getResourceDict(resourceDescription, credDict)
71-
72-
# Make a nice print of the resource matching parameters
73-
toPrintDict = dict(resourceDict)
74-
if "MaxRAM" in resourceDescription:
75-
toPrintDict["MaxRAM"] = resourceDescription["MaxRAM"]
76-
if "NumberOfProcessors" in resourceDescription:
77-
toPrintDict["NumberOfProcessors"] = resourceDescription["NumberOfProcessors"]
78-
toPrintDict["Tag"] = []
79-
if "Tag" in resourceDict:
80-
for tag in resourceDict["Tag"]:
81-
if not tag.endswith("GB") and not tag.endswith("Processors"):
82-
toPrintDict["Tag"].append(tag)
83-
if not toPrintDict["Tag"]:
84-
toPrintDict.pop("Tag")
85-
self.log.info("Resource description for matching", printDict(toPrintDict))
86-
87-
negativeCond = self.limiter.getNegativeCondForSite(resourceDict["Site"], resourceDict.get("GridCE"))
88-
result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)
89-
90-
if not result["OK"]:
91-
raise RuntimeError(result["Message"])
92-
result = result["Value"]
93-
if not result["matchFound"]:
94-
self.log.info("No match found")
95-
return {}
96-
97-
jobID = result["jobId"]
98-
resAtt = self.jobDB.getJobAttributes(jobID, ["Status"])
99-
if not resAtt["OK"]:
100-
raise RuntimeError("Could not retrieve job attributes")
101-
if not resAtt["Value"]:
102-
raise RuntimeError("No attributes returned for job")
103-
if not resAtt["Value"]["Status"] == "Waiting":
104-
self.log.error("Job matched by the TQ is not in Waiting state", str(jobID))
105-
result = self.tqDB.deleteJob(jobID)
10687
if not result["OK"]:
10788
raise RuntimeError(result["Message"])
108-
raise RuntimeError(f"Job {str(jobID)} is not in Waiting state")
89+
result = result["Value"]
90+
if not result["matchFound"]:
91+
self.log.info("No match found")
92+
return {}
93+
94+
jobID = result["jobId"]
95+
resAtt = self.jobDB.getJobAttributes(jobID, ["Status"])
96+
if not resAtt["OK"]:
97+
raise RuntimeError("Could not retrieve job attributes")
98+
if not resAtt["Value"]:
99+
raise RuntimeError("No attributes returned for job")
100+
if not resAtt["Value"]["Status"] == "Waiting":
101+
self.log.error("Job matched by the TQ is not in Waiting state", str(jobID))
102+
result = self.tqDB.deleteJob(jobID)
103+
if not result["OK"]:
104+
raise RuntimeError(result["Message"])
105+
raise RuntimeError(f"Job {str(jobID)} is not in Waiting state")
109106

110-
self._reportStatus(resourceDict, jobID)
107+
self._reportStatus(resourceDict, jobID)
111108

112-
result = self.jobDB.getJobJDL(jobID)
113-
if not result["OK"]:
114-
raise RuntimeError("Failed to get the job JDL")
115-
116-
resultDict = {}
117-
resultDict["JDL"] = result["Value"]
118-
resultDict["JobID"] = jobID
119-
120-
matchTime = time.time() - startTime
121-
self.log.verbose("Match time", f"[{str(matchTime)}]")
122-
123-
# Get some extra stuff into the response returned
124-
resOpt = self.jobDB.getJobOptParameters(jobID)
125-
if resOpt["OK"]:
126-
for key, value in resOpt["Value"].items():
127-
resultDict[key] = value
128-
resAtt = self.jobDB.getJobAttributes(jobID, ["Owner", "OwnerGroup"])
129-
if not resAtt["OK"]:
130-
raise RuntimeError("Could not retrieve job attributes")
131-
if not resAtt["Value"]:
132-
raise RuntimeError("No attributes returned for job")
133-
134-
if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
135-
self.limiter.updateDelayCounters(resourceDict["Site"], jobID)
136-
137-
pilotInfoReportedFlag = resourceDict.get("PilotInfoReportedFlag", False)
138-
if not pilotInfoReportedFlag:
139-
self._updatePilotInfo(resourceDict)
140-
self._updatePilotJobMapping(resourceDict, jobID)
141-
142-
resultDict["Owner"] = resAtt["Value"]["Owner"]
143-
resultDict["Group"] = resAtt["Value"]["OwnerGroup"]
144-
resultDict["PilotInfoReportedFlag"] = True
145-
146-
return resultDict
109+
result = self.jobDB.getJobJDL(jobID)
110+
if not result["OK"]:
111+
raise RuntimeError("Failed to get the job JDL")
112+
113+
resultDict = {}
114+
resultDict["JDL"] = result["Value"]
115+
resultDict["JobID"] = jobID
116+
117+
matchTime = time.time() - startTime
118+
self.log.verbose("Match time", f"[{str(matchTime)}]")
119+
120+
# Get some extra stuff into the response returned
121+
resOpt = self.jobDB.getJobOptParameters(jobID)
122+
if resOpt["OK"]:
123+
for key, value in resOpt["Value"].items():
124+
resultDict[key] = value
125+
resAtt = self.jobDB.getJobAttributes(jobID, ["Owner", "OwnerGroup"])
126+
if not resAtt["OK"]:
127+
raise RuntimeError("Could not retrieve job attributes")
128+
if not resAtt["Value"]:
129+
raise RuntimeError("No attributes returned for job")
130+
131+
if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
132+
self.limiter.updateDelayCounters(resourceDict["Site"], jobID)
133+
134+
pilotInfoReportedFlag = resourceDict.get("PilotInfoReportedFlag", False)
135+
if not pilotInfoReportedFlag:
136+
self._updatePilotInfo(resourceDict)
137+
self._updatePilotJobMapping(resourceDict, jobID)
138+
139+
resultDict["Owner"] = resAtt["Value"]["Owner"]
140+
resultDict["Group"] = resAtt["Value"]["OwnerGroup"]
141+
resultDict["PilotInfoReportedFlag"] = True
142+
143+
return resultDict
147144

148145
def _getResourceDict(self, resourceDescription, credDict):
149146
"""from resourceDescription to resourceDict (just various mods)"""

src/DIRAC/WorkloadManagementSystem/DB/JobDB.py

+11
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from DIRAC.Core.Utilities.Decorators import deprecated
2222
from DIRAC.Core.Utilities.DErrno import EWMSJMAN, EWMSSUBM, cmpError
2323
from DIRAC.Core.Utilities.ReturnValues import S_ERROR, S_OK
24+
from DIRAC.FrameworkSystem.Client.Logger import contextLogger
2425
from DIRAC.ResourceStatusSystem.Client.SiteStatus import SiteStatus
2526
from DIRAC.WorkloadManagementSystem.Client import JobMinorStatus, JobStatus
2627
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
@@ -42,6 +43,8 @@ def __init__(self, parentLogger=None):
4243

4344
DB.__init__(self, "JobDB", "WorkloadManagement/JobDB", parentLogger=parentLogger)
4445

46+
self._defaultLogger = self.log
47+
4548
# data member to check if __init__ went through without error
4649
self.__initialized = False
4750
self.maxRescheduling = self.getCSOption("MaxRescheduling", 3)
@@ -64,6 +67,14 @@ def __init__(self, parentLogger=None):
6467
self.log.info("==================================================")
6568
self.__initialized = True
6669

70+
@property
71+
def log(self):
72+
return contextLogger.get() or self._defaultLogger
73+
74+
@log.setter
75+
def log(self, value):
76+
self._defaultLogger = value
77+
6778
def isValid(self):
6879
"""Check if correctly initialised"""
6980
return self.__initialized

0 commit comments

Comments
 (0)