Skip to content
This repository was archived by the owner on Jan 13, 2025. It is now read-only.

Commit 8e30efb

Browse files
authored
fix(api): Fix metrics lifecycle policies tasks (#29)
* chore(api): Add endpoint for triggering MLP / Run MLP every 2 hours * chore(api): Bump app version #patch --------- Signed-off-by: hayk96 <hayko5999@gmail.com>
1 parent 4374aa2 commit 8e30efb

File tree

12 files changed

+138
-47
lines changed

12 files changed

+138
-47
lines changed

.dockerignore

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
.venv/
2+
.idea/
23
tests/
34
docs/

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ cython_debug/
157157
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158158
# and can be added to the global gitignore or merged into this file. For a more nuclear
159159
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160-
#.idea/
160+
.idea/
161161

162162
# User defined files
163163
docs/examples/docker/rules

CHANGELOG.md

+6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Changelog
22

3+
## 0.3.3 / 2024-06-16
4+
5+
* [ENHANCEMENT] Added a new endpoint: `/metrics-lifecycle-policies/trigger` for force-triggering all Metrics Lifecycle Policies. #29
6+
* [CHANGE] The scheduler now runs the "Clean-up Prometheus time-series" task every 2 hours (previously every 20 minutes).
7+
* [BUGFIX] Prevented the execution of more than one task at the same time, as tasks can remain in the running state for longer than their execution interval.
8+
39
## 0.3.2 / 2024-06-08
410

511
* [ENHANCEMENT] Added a new endpoint: `/health` for retrieving system health. #28

Dockerfile

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
FROM python:3.10-alpine
22
LABEL maintainer="Hayk Davtyan <hayko5999@gmail.com>"
3+
ENV TZ UTC
34
WORKDIR app
45
COPY . .
56
RUN python -m pip install --no-cache-dir -r requirements.txt

src/api/v1/endpoints/policies.py

+61-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
from src.models.policy import MetricsLifecyclePolicyCreate, MetricsLifecyclePolicyUpdate
22
from fastapi import APIRouter, Response, Request, Body, status
3+
from apscheduler.triggers.date import DateTrigger
4+
from src.utils.scheduler import schedule
35
from src.core import policies as mlp
46
from src.utils.log import logger
7+
from datetime import datetime
58
from typing import Annotated
69

710
router = APIRouter()
@@ -20,7 +23,7 @@
2023
"application/json": {
2124
"example": [
2225
{
23-
"match": "{__name__=~'go_.+'}",
26+
"match": "{__name__=~'go_.*'}",
2427
"keep_for": "7d",
2528
"description": "This metrics lifecycle policy keeps GoLang metrics for 7 days"
2629
}
@@ -78,7 +81,7 @@ async def get_policy(
7881
"example": [
7982
{
8083
"GoLang Policy": {
81-
"match": "{__name__=~'go_.+'}",
84+
"match": "{__name__=~'go_.*'}",
8285
"keep_for": "7d",
8386
"message": "This policy keeps GoLang metrics for 7 days"
8487
},
@@ -109,8 +112,8 @@ async def get_policies(
109112

110113

111114
@router.post("/metrics-lifecycle-policies",
112-
name="Create metric lifecycle policy",
113-
description="Creates a new metric lifecycle policy",
115+
name="Create metrics lifecycle policy",
116+
description="Creates a new metrics lifecycle policy",
114117
status_code=status.HTTP_201_CREATED,
115118
tags=["metrics-lifecycle-policies"],
116119
responses={
@@ -354,3 +357,57 @@ async def delete(
354357
return {
355358
"status": sts,
356359
"message": msg} if response.status_code != 204 else response.status_code
360+
361+
362+
@router.post("/metrics-lifecycle-policies/trigger",
             name="Trigger metrics lifecycle policies",
             description="Force triggers all new metrics lifecycle policies",
             status_code=status.HTTP_202_ACCEPTED,
             tags=["metrics-lifecycle-policies"],
             responses={
                 202: {
                     "description": "Accepted",
                     "content": {
                         "application/json": {
                             "example": [
                                 {
                                     "status": "success",
                                     "message": "Request has been accepted for processing"
                                 }
                             ]
                         }
                     }
                 },
                 409: {
                     "description": "Conflict",
                     "content": {
                         "application/json": {
                             "example": [
                                 {
                                     "status": "error",
                                     "message": "Cannot create a new task. Server is currently processing another task"
                                 }
                             ]
                         }
                     }
                 },
             }
             )
async def trigger(request: Request, response: Response):
    """
    Schedule an immediate, one-off run of the metrics lifecycle policies.

    Responds with 202 when the run has been handed to the scheduler and
    with 409 when the clean-up task is already flagged as running. The
    outcome is logged together with the request method and path.

    Returns a dict with the keys "status" and "message".
    """
    # Imported on every call on purpose: `from ... import` copies the
    # current value of the module-level flag, so importing here reads the
    # value the task module holds at request time rather than at start-up.
    from src.tasks.policies import running_tasks
    if not running_tasks:
        # A DateTrigger fires exactly once, at the given moment.
        schedule(trigger=DateTrigger(run_date=datetime.now()))
        response.status_code, sts, msg = 202, "success", "Request has been accepted for processing"
    else:
        response.status_code, sts, msg = 409, "error", \
            "Cannot create a new task. Server is currently processing another task"
    logger.info(
        msg=msg,
        extra={
            "status": response.status_code,
            "method": request.method,
            "request_path": request.url.path})
    return {
        "status": sts,
        "message": msg
    }

src/models/policy.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class MetricsLifecyclePolicyCreate(BaseModel, extra=Extra.allow):
1212
"description": "Time-series matching with regex will be kept for 7 days",
1313
"value": {
1414
"name": "Example Policy",
15-
"match": "{__name__=~'go_.+'}",
15+
"match": "{__name__=~'go_.*'}",
1616
"keep_for": "7d",
1717
"description": "Time-series matching with regex will be kept for 7 days."
1818
}

src/tasks/policies.py

+56-33
Original file line numberDiff line numberDiff line change
@@ -2,62 +2,85 @@
22
from src.utils.arguments import arg_parser
33
from src.utils.log import logger
44
from pytimeparse2 import parse
5+
from time import time
56
import requests
6-
import time
7+
78

89
prom_addr = arg_parser().get("prom.addr")
10+
running_tasks = False
911

1012

11-
def delete_series(policy_name: str, policy: dict) -> bool:
    """
    Delete the time-series selected by one metrics lifecycle policy.

    This function calls following Prometheus endpoint:
      POST /api/v1/admin/tsdb/delete_series
    Samples of the series matching policy["match"] that are older than
    policy["keep_for"] (a duration string such as "7d") are deleted.

    :param policy_name: name of the policy, attached to every log record
    :param policy: policy settings; the keys "match" and "keep_for" are read
    :return: True when Prometheus answers 204, False on any failure
             (failures are logged, never raised)
    """
    try:
        # Everything with a timestamp before this instant is deleted.
        time_range = time() - parse(policy["keep_for"])
        # The selector is passed via `params` so that it gets URL-encoded.
        # Spliced into the URL as raw text, a literal "+" or "&" inside the
        # matcher (e.g. the regex 'go_.+') would change the query string.
        r = requests.post(
            f"{prom_addr}/api/v1/admin/tsdb/delete_series",
            params={"match[]": policy["match"], "end": time_range})
    except Exception as e:
        logger.error(e, extra={"policy_name": policy_name})
        return False

    if r.status_code == 204:
        logger.debug("Task clean-up time-series has been successfully completed",
                     extra={"policy_name": policy_name})
        return True

    # The error body is not guaranteed to be a JSON object (for example
    # when a proxy answers instead), so fall back to the raw text.
    try:
        reason = r.json().get('error')
    except (ValueError, AttributeError):
        reason = r.text
    logger.error(f"Failed to delete series, {reason}", extra={
        "status": r.status_code, "policy_name": policy_name})
    return False
34+
35+
36+
def clean_tombstones() -> bool:
    """
    Ask Prometheus to purge deleted data and its tombstones.

    This function calls following Prometheus endpoint:
      POST /api/v1/admin/tsdb/clean_tombstones
    Removes the deleted data from disk and
    cleans up the existing tombstones

    :return: True when Prometheus answers 204, False on any failure
             (failures are logged, never raised)
    """
    try:
        r = requests.post(
            f'{prom_addr}/api/v1/admin/tsdb/clean_tombstones')
    except Exception as e:
        logger.error(e)
        return False

    if r.status_code == 204:
        return True

    # The error body is not guaranteed to be a JSON object (for example
    # when a proxy answers instead), so fall back to the raw text.
    try:
        reason = r.json().get('error')
    except (ValueError, AttributeError):
        reason = r.text
    logger.error(f"Failed to clean tombstones, {reason}", extra={
        "status": r.status_code})
    return False
54+
55+
56+
def run_policies() -> bool:
    """
    This function loops over user-defined metrics lifecycle
    policies and executes the clean-up job one by one

    The module-level flag `running_tasks` is raised for the duration of
    the run; a call made while it is raised logs a warning and does
    nothing.

    :return: False when another run is in progress, True otherwise
             (including when there are no policies to process)
    """
    global running_tasks
    if running_tasks:
        logger.warning(
            "Cannot create a new task. Server is currently processing another task")
        return False

    policies = load_policies()
    if not policies:
        return True

    logger.debug(
        f"Found {len(policies)} metrics lifecycle {'policies' if len(policies) > 1 else 'policy'}. "
        f"Starting job to clean-up time-series.")
    running_tasks = True
    start_time = time()
    try:
        for name, policy in policies.items():
            logger.debug(
                "Task clean-up series is in progress", extra={
                    "policy_name": name, "match": policy["match"],
                    "keep_for": policy["keep_for"]})
            delete_series(policy_name=name, policy=policy)
        # Tombstones are purged once, after every policy has been applied.
        clean_tombstones()
    finally:
        # Always lower the flag: if it stayed raised after an unexpected
        # error, every later run would be rejected as "already running".
        running_tasks = False
    exec_time = float("{:.2f}".format(time() - start_time))
    logger.debug(
        "Task clean-up series has been completed", extra={
            "duration": exec_time})
    return True

src/utils/openapi.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def openapi(app: FastAPI):
1616
"providing additional features and addressing its limitations. "
1717
"Running as a sidecar alongside the Prometheus server enables "
1818
"users to extend the capabilities of the API.",
19-
version="0.3.2",
19+
version="0.3.3",
2020
contact={
2121
"name": "Hayk Davtyan",
2222
"url": "https://hayk96.github.io",

src/utils/scheduler.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
from apscheduler.schedulers.background import BackgroundScheduler
22
from apscheduler.triggers.interval import IntervalTrigger
3-
from src.tasks.policies import task_run_policies
3+
from src.tasks.policies import run_policies
44
import atexit
55

66

7-
# Single scheduler shared by every schedule() call; created on first use.
_scheduler = None


def schedule(trigger=None):
    """
    Register the "Clean-up Prometheus time-series" job with the scheduler.

    :param trigger: APScheduler trigger that decides when run_policies
                    executes. Defaults to an interval of 2 hours, built
                    at call time so the interval is measured from the
                    moment the job is registered rather than from import.
    """
    global _scheduler
    if trigger is None:
        trigger = IntervalTrigger(hours=2)
    if _scheduler is None:
        # Start one background scheduler and register its shutdown hook
        # once. Creating a new scheduler per call would leave an extra
        # running scheduler behind for every one-off job that is added.
        _scheduler = BackgroundScheduler()
        _scheduler.start()
        atexit.register(_scheduler.shutdown)
    _scheduler.add_job(
        func=run_policies,
        trigger=trigger,
        replace_existing=True,
        name="Clean-up Prometheus time-series"
    )

ui/homepage/index.html

+1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
<meta charset="UTF-8">
55
<meta name="viewport" content="width=device-width, initial-scale=1.0">
66
<title>Extended HTTP API service for Prometheus</title>
7+
<link rel="icon" type="image/png" href="https://raw.githubusercontent.com/hayk96/prometheus-api/main/docs/images/logo.png">
78
<style>
89
body, h1, ul, li, a {
910
margin: 0;

ui/metrics-management/index.html

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
<meta name="viewport" content="width=device-width, initial-scale=1.0">
66
<title>Metrics Management</title>
77
<link rel="stylesheet" href="/metrics-management/style.css">
8+
<link rel="icon" type="image/png" href="https://raw.githubusercontent.com/hayk96/prometheus-api/main/docs/images/logo.png">
89
</head>
910
<body>
1011
<div id="sidebar" class="sidebar">

ui/rules-management/index.html

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
<head>
44
<meta charset="UTF-8">
55
<meta name="viewport" content="width=device-width, initial-scale=1.0">
6-
<title>Edit Rule</title>
6+
<title>Rules Management</title>
77
<link rel="stylesheet" href="/rules-management/style.css">
8+
<link rel="icon" type="image/png" href="https://raw.githubusercontent.com/hayk96/prometheus-api/main/docs/images/logo.png">
89
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.65.2/codemirror.min.css">
910
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.65.2/theme/monokai.min.css">
1011
</head>

0 commit comments

Comments
 (0)