fix(api): Fix metrics lifecycle policies tasks (#29)

hayk96 · web-flow · commit 8e30efb6b0ff · 2024-06-16T18:29:39.000+04:00
* chore(api): Add endpoint for triggering MLP / Run MLP every 2 hours

* chore(api): Bump app version #patch

---------

Signed-off-by: hayk96 <hayko5999@gmail.com>
diff --git a/.dockerignore b/.dockerignore
@@ -1,3 +1,4 @@
 .venv/
+.idea/
 tests/
 docs/
diff --git a/.gitignore b/.gitignore
@@ -157,7 +157,7 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
 
 # User defined files
 docs/examples/docker/rules
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## 0.3.3 / 2024-06-16
+
+* [ENHANCEMENT] Added a new endpoint: `/metrics-lifecycle-policies/trigger` for force-triggering all Metrics Lifecycle Policies. #29
+* [CHANGE] Changed the scheduler's execution interval for the "Clean-up Prometheus time-series" task to 2 hours (previously 20 minutes).
+* [BUGFIX] Prevented the execution of more than one task at the same time, as tasks can remain in the running state for longer than their execution interval.
+
 ## 0.3.2 / 2024-06-08
 
 * [ENHANCEMENT] Added a new endpoint: `/health` for retrieving system health. #28
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,6 @@
 FROM python:3.10-alpine
 LABEL maintainer="Hayk Davtyan <hayko5999@gmail.com>"
+ENV TZ UTC
 WORKDIR app
 COPY . .
 RUN python -m pip install --no-cache-dir -r requirements.txt
diff --git a/src/api/v1/endpoints/policies.py b/src/api/v1/endpoints/policies.py
@@ -1,7 +1,10 @@
 from src.models.policy import MetricsLifecyclePolicyCreate, MetricsLifecyclePolicyUpdate
 from fastapi import APIRouter, Response, Request, Body, status
+from apscheduler.triggers.date import DateTrigger
+from src.utils.scheduler import schedule
 from src.core import policies as mlp
 from src.utils.log import logger
+from datetime import datetime
 from typing import Annotated
 
 router = APIRouter()
@@ -20,7 +23,7 @@
                         "application/json": {
                             "example": [
                                 {
-                                    "match": "{__name__=~'go_.+'}",
+                                    "match": "{__name__=~'go_.*'}",
                                     "keep_for": "7d",
                                     "description": "This metrics lifecycle policy keeps GoLang metrics for 7 days"
                                 }
@@ -78,7 +81,7 @@ async def get_policy(
                             "example": [
                                 {
                                     "GoLang Policy": {
-                                        "match": "{__name__=~'go_.+'}",
+                                        "match": "{__name__=~'go_.*'}",
                                         "keep_for": "7d",
                                         "message": "This policy keeps GoLang metrics for 7 days"
                                     },
@@ -109,8 +112,8 @@ async def get_policies(
 
 
 @router.post("/metrics-lifecycle-policies",
-             name="Create metric lifecycle policy",
-             description="Creates a new metric lifecycle policy",
+             name="Create metrics lifecycle policy",
+             description="Creates a new metrics lifecycle policy",
              status_code=status.HTTP_201_CREATED,
              tags=["metrics-lifecycle-policies"],
              responses={
@@ -354,3 +357,57 @@ async def delete(
     return {
         "status": sts,
         "message": msg} if response.status_code != 204 else response.status_code
+
+
+@router.post("/metrics-lifecycle-policies/trigger",
+             name="Trigger metrics lifecycle policies",
+             description="Force triggers all new metrics lifecycle policies",
+             status_code=status.HTTP_202_ACCEPTED,
+             tags=["metrics-lifecycle-policies"],
+             responses={
+                  202: {
+                      "description": "Accepted",
+                      "content": {
+                          "application/json": {
+                              "example": [
+                                  {
+                                      "status": "success",
+                                      "message": "Your request has been accepted for processing"
+                                  }
+                              ]
+                          }
+                      }
+                  },
+                 409: {
+                     "description": "Conflict",
+                     "content": {
+                         "application/json": {
+                             "example": [
+                                 {
+                                     "status": "error",
+                                     "message": "Cannot create a new task. Server is currently processing another task"
+                                 }
+                             ]
+                         }
+                     }
+                  },
+             }
+             )
+async def trigger(request: Request, response: Response):
+    from src.tasks.policies import running_tasks
+    if not running_tasks:
+        schedule(trigger=DateTrigger(run_date=datetime.now()))
+        response.status_code, sts, msg = 202, "success", "Request has been accepted for processing"
+    else:
+        response.status_code, sts, msg = 409, "error", \
+            "Cannot create a new task. Server is currently processing another task"
+    logger.info(
+        msg=msg,
+        extra={
+            "status": response.status_code,
+            "method": request.method,
+            "request_path": request.url.path})
+    return {
+        "status": sts,
+        "message": msg
+    }
diff --git a/src/models/policy.py b/src/models/policy.py
@@ -12,7 +12,7 @@ class MetricsLifecyclePolicyCreate(BaseModel, extra=Extra.allow):
             "description": "Time-series matching with regex will be kept for 7 days",
             "value": {
                 "name": "Example Policy",
-                "match": "{__name__=~'go_.+'}",
+                "match": "{__name__=~'go_.*'}",
                 "keep_for": "7d",
                 "description": "Time-series matching with regex will be kept for 7 days."
             }
diff --git a/src/tasks/policies.py b/src/tasks/policies.py
@@ -2,62 +2,85 @@
 from src.utils.arguments import arg_parser
 from src.utils.log import logger
 from pytimeparse2 import parse
+from time import time
 import requests
-import time
+
 
 prom_addr = arg_parser().get("prom.addr")
+running_tasks = False
 
 
-def delete_series(policy_name: str, policy: dict) -> None:
+def delete_series(policy_name: str, policy: dict) -> bool:
     """
-    This function calls two Prometheus endpoints:
-    * POST /api/v1/admin/tsdb/delete_series
-    * POST /api/v1/admin/tsdb/clean_tombstones
+    This function calls following Prometheus endpoint:
+    POST /api/v1/admin/tsdb/delete_series
     User-defined policies passed to this function
-    perform cleanup based on the specified policy settings.
+    perform clean-up based on the specified policy settings.
     """
-    time_range = time.time() - parse(policy["keep_for"])
-    start_time = time.time()
+    time_range = time() - parse(policy["keep_for"])
     try:
         r = requests.post(
             f'{prom_addr}/api/v1/admin/tsdb/delete_series?match[]={policy["match"]}&end={time_range}')
     except BaseException as e:
         logger.error(e, extra={"policy_name": policy_name})
     else:
-        if r.status_code != 204:
-            logger.error(f"Failed to delete series, {r.json().get('error')}", extra={
-                         "status": r.status_code, "policy_name": policy_name})
-            return
-        try:
-            r = requests.post(
-                f'{prom_addr}/api/v1/admin/tsdb/clean_tombstones')
-        except BaseException as e:
-            logger.error(e, extra={"policy_name": policy_name})
-            return
-        else:
-            if r.status_code != 204:
-                logger.error(f"Failed to clean tombstones, {r.json().get('error')}", extra={
-                             "status": r.status_code, "policy_name": policy_name})
-                return
-        exec_time = float("{:.2f}".format(time.time() - start_time))
-        logger.debug("Task cleanup time-series has been successfully completed",
-                     extra={"policy_name": policy_name, "exec_time": exec_time})
-        return
-
-
-def task_run_policies():
+        if r.status_code == 204:
+            logger.debug("Task clean-up time-series has been successfully completed",
+                         extra={"policy_name": policy_name})
+            return True
+        logger.error(f"Failed to delete series, {r.json().get('error')}", extra={
+                     "status": r.status_code, "policy_name": policy_name})
+    return False
+
+
+def clean_tombstones() -> bool:
+    """
+    This function calls following Prometheus endpoint:
+    POST /api/v1/admin/tsdb/clean_tombstones
+    Removes the deleted data from disk and
+    cleans up the existing tombstones
+    """
+    try:
+        r = requests.post(
+            f'{prom_addr}/api/v1/admin/tsdb/clean_tombstones')
+    except BaseException as e:
+        logger.error(e)
+    else:
+        if r.status_code == 204:
+            return True
+        logger.error(f"Failed to clean tombstones, {r.json().get('error')}", extra={
+            "status": r.status_code})
+    return False
+
+
+def run_policies() -> bool:
     """
     This function loops over user-defined metrics lifecycle
-    policies and executes the cleanup job one by one
+    policies and executes the clean-up job one by one
     """
+    global running_tasks
+    if running_tasks:
+        logger.warning(
+            "Cannot create a new task. Server is currently processing another task")
+        return False
+
     policies = load_policies()
     if policies:
         logger.debug(
             f"Found {len(policies)} metrics lifecycle {'policies' if len(policies) > 1 else 'policy'}. "
-            f"Starting job to cleanup time-series.")
+            f"Starting job to clean-up time-series.")
+        running_tasks = True
+        start_time = time()
         for p in policies:
             logger.debug(
-                "Task cleanup time-series is in progress", extra={
+                "Task clean-up series is in progress", extra={
                     "policy_name": p, "match": policies[p]["match"],
                     "keep_for": policies[p]["keep_for"]})
             delete_series(policy_name=p, policy=policies[p])
+        clean_tombstones()
+        exec_time = float("{:.2f}".format(time() - start_time))
+        running_tasks = False
+        logger.debug(
+            "Task clean-up series has been completed", extra={
+                "duration": exec_time})
+    return True
diff --git a/src/utils/openapi.py b/src/utils/openapi.py
@@ -16,7 +16,7 @@ def openapi(app: FastAPI):
                     "providing additional features and addressing its limitations. "
                     "Running as a sidecar alongside the Prometheus server enables "
                     "users to extend the capabilities of the API.",
-        version="0.3.2",
+        version="0.3.3",
         contact={
             "name": "Hayk Davtyan",
             "url": "https://hayk96.github.io",
diff --git a/src/utils/scheduler.py b/src/utils/scheduler.py
@@ -1,16 +1,16 @@
 from apscheduler.schedulers.background import BackgroundScheduler
 from apscheduler.triggers.interval import IntervalTrigger
-from src.tasks.policies import task_run_policies
+from src.tasks.policies import run_policies
 import atexit
 
 
-def schedule():
+def schedule(trigger=IntervalTrigger(hours=2)):
     scheduler = BackgroundScheduler()
     scheduler.start()
     scheduler.add_job(
-        func=task_run_policies,
-        trigger=IntervalTrigger(minutes=20),
-        name='Schedule task "cleanup time-series" every 20 minutes',
-        replace_existing=True
+        func=run_policies,
+        trigger=trigger,
+        replace_existing=True,
+        name="Clean-up Prometheus time-series"
     )
     atexit.register(lambda: scheduler.shutdown())
diff --git a/ui/homepage/index.html b/ui/homepage/index.html
@@ -4,6 +4,7 @@
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <title>Extended HTTP API service for Prometheus</title>
+    <link rel="icon" type="image/png" href="https://raw.githubusercontent.com/hayk96/prometheus-api/main/docs/images/logo.png">
     <style>
         body, h1, ul, li, a {
             margin: 0;
diff --git a/ui/metrics-management/index.html b/ui/metrics-management/index.html
@@ -5,6 +5,7 @@
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <title>Metrics Management</title>
     <link rel="stylesheet" href="/metrics-management/style.css">
+    <link rel="icon" type="image/png" href="https://raw.githubusercontent.com/hayk96/prometheus-api/main/docs/images/logo.png">
 </head>
 <body>
     <div id="sidebar" class="sidebar">
diff --git a/ui/rules-management/index.html b/ui/rules-management/index.html
@@ -3,8 +3,9 @@
 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Edit Rule</title>
+    <title>Rules Management</title>
     <link rel="stylesheet" href="/rules-management/style.css">
+    <link rel="icon" type="image/png" href="https://raw.githubusercontent.com/hayk96/prometheus-api/main/docs/images/logo.png">
     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.65.2/codemirror.min.css">
     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.65.2/theme/monokai.min.css">
 </head>

Original file line number	Diff line change
@@ -1,3 +1,4 @@
 .venv/
+.idea/
 tests/
 docs/
Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@ class MetricsLifecyclePolicyCreate(BaseModel, extra=Extra.allow):`
`12`	`12`	`"description": "Time-series matching with regex will be kept for 7 days",`
`13`	`13`	`"value": {`
`14`	`14`	`"name": "Example Policy",`
`15`		`- "match": "{__name__=~'go_.+'}",`
	`15`	`+ "match": "{__name__=~'go_.*'}",`
`16`	`16`	`"keep_for": "7d",`
`17`	`17`	`"description": "Time-series matching with regex will be kept for 7 days."`
`18`	`18`	`}`