diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml
new file mode 100644
index 0000000000..053a8733dc
--- /dev/null
+++ b/.github/workflows/dependency-review.yml
@@ -0,0 +1,22 @@
+# Dependency Review Action
+#
+# This Action will scan dependency manifest files that change as part of a Pull Request, surfacing known-vulnerable versions of the packages declared or updated in the PR. Once installed, if the workflow run is marked as required, PRs introducing known-vulnerable packages will be blocked from merging.
+#
+# Source repository: https://github.com/actions/dependency-review-action
+# Public documentation: https://docs.github.com/en/code-security/supply-chain-security/understanding-your-software-supply-chain/about-dependency-review#dependency-review-enforcement
+name: 'Dependency Review'
+on: [pull_request]
+
+permissions:
+  contents: read
+
+jobs:
+  dependency-review:
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout Repository'
+        uses: actions/checkout@v3
+      - name: 'Dependency Review'
+        uses: actions/dependency-review-action@v2
+        with:
+          fail-on-severity: high
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 54ff60451b..80e07b7522 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,122 +1,331 @@
-# How to Contribute
+# Contributing Guidelines
-
-There are primarily 2 areas in which you can contribute in SigNoz
+## Welcome to SigNoz Contributing section 🎉
-
-- Frontend ( written in Typescript, React)
-- Backend - ( Query Service - written in Go)
+Hi there! We're thrilled that you'd like to contribute to this project, thank you for your interest. Whether it's a bug report, new feature, correction, or additional documentation, we greatly value feedback and contributions from our community.
-
-Depending upon your area of expertise & interest, you can chose one or more to contribute. Below are detailed instructions to contribute in each area
+
+Please read through this document before submitting any issues or pull requests to ensure we have all the necessary information to effectively respond to your bug report or contribution.
-
-> Please note: If you want to work on an issue, please ask the maintainers to assign the issue to you before starting work on it. This would help us understand who is working on an issue and prevent duplicate work. 🙏🏻
+
+- We accept contributions made to the [SigNoz `develop` branch](https://github.com/SigNoz/signoz/tree/develop)
+- Find all SigNoz Docker Hub images here
+  - [signoz/frontend](https://hub.docker.com/r/signoz/frontend)
+  - [signoz/query-service](https://hub.docker.com/r/signoz/query-service)
+  - [signoz/otelcontribcol](https://hub.docker.com/r/signoz/otelcontribcol)
-
-> If you just raise a PR, without the corresponding issue being assigned to you - it may not be accepted.
+
+## Finding contributions to work on 💬
-
-# Develop Frontend
+
+Looking at the existing issues is a great way to find something to contribute to.
+Also, have a look at issues with the [good first issue](https://github.com/SigNoz/signoz/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) label to start with.
-Need to update [https://github.com/SigNoz/signoz/tree/main/frontend](https://github.com/SigNoz/signoz/tree/main/frontend)
-
-### Contribute to Frontend with Docker installation of SigNoz
+
+## Sections:
+- [General Instructions](#1-general-instructions-)
+  - [For Creating Issue(s)](#11-for-creating-issues)
+  - [For Pull Request(s)](#12-for-pull-requests)
+- [How to Contribute](#2-how-to-contribute-%EF%B8%8F)
+- [Develop Frontend](#3-develop-frontend-)
+  - [Contribute to Frontend with Docker installation of SigNoz](#31-contribute-to-frontend-with-docker-installation-of-signoz)
+  - [Contribute to Frontend without installing SigNoz backend](#32-contribute-to-frontend-without-installing-signoz-backend)
+- [Contribute to Backend (Query-Service)](#4-contribute-to-backend-query-service-)
+  - [To run ClickHouse setup](#41-to-run-clickhouse-setup-recommended-for-local-development)
+- [Contribute to SigNoz Helm Chart](#5-contribute-to-signoz-helm-chart-)
+  - [To run helm chart for local development](#51-to-run-helm-chart-for-local-development)
+- [Other Ways to Contribute](#other-ways-to-contribute)
-
-- `git clone https://github.com/SigNoz/signoz.git && cd signoz`
-- comment out frontend service section at `deploy/docker/clickhouse-setup/docker-compose.yaml#L62`
-- run `cd deploy` to move to deploy directory
-- Install signoz locally without the frontend
-    - Add below configuration to query-service section at `docker/clickhouse-setup/docker-compose.yaml#L38`
+
+# 1. General Instructions 📝
-    ```docker
+
+## 1.1 For Creating Issue(s)
+Before making any significant changes and before filing a new issue, please check [existing open](https://github.com/SigNoz/signoz/issues?q=is%3Aopen+is%3Aissue) or [recently closed](https://github.com/SigNoz/signoz/issues?q=is%3Aissue+is%3Aclosed) issues to make sure somebody else hasn't already reported it. Please try to include as much information as you can.
+
+**Issue Types** - [Bug Report](https://github.com/SigNoz/signoz/issues/new?assignees=&labels=&template=bug_report.md&title=) | [Feature Request](https://github.com/SigNoz/signoz/issues/new?assignees=&labels=&template=feature_request.md&title=) | [Performance Issue Report](https://github.com/SigNoz/signoz/issues/new?assignees=&labels=&template=performance-issue-report.md&title=) | [Report a Security Vulnerability](https://github.com/SigNoz/signoz/security/policy)
+
+#### If you are proposing a change, details like these are incredibly useful:
+
+- **Requirement** - what kind of use case are you trying to solve?
+- **Proposal** - what do you suggest to solve the problem or improve the existing situation?
+- Any open questions to address❓
+
+#### If you are reporting a bug, details like these are incredibly useful:
+
+- A reproducible test case or series of steps.
+- The version of our code being used.
+- Any modifications you've made relevant to the bug🐞.
+- Anything unusual about your environment or deployment.
+
+Discussing your proposed changes ahead of time will make the contribution
+process smooth for everyone 🙌.
+
+ **[`^top^`](#)**
+
+
+## 1.2 For Pull Request(s)
+
+Contributions via pull requests are much appreciated. Once the approach is agreed upon ✅, make your changes and open a pull request.
+Before sending us a pull request, please follow these steps:
+
+- Fork the SigNoz repo on GitHub, then clone it on your machine.
+- Create a branch with your changes.
+- Make sure you are working against the latest source on the `develop` branch.
+- Modify the source; please focus only on the specific change you are contributing.
+- Ensure local tests pass.
+- Commit to your fork using clear commit messages.
+- Send us a pull request, answering any default questions in the pull request interface.
+- Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
+- Once you've pushed your commits to GitHub, make sure that your branch can be auto-merged (there are no merge conflicts). If not, on your computer, merge `develop` into your branch, resolve any merge conflicts, make sure everything still runs correctly and passes all the tests, and then push up those changes.
+- Once the change has been approved and merged, we will inform you in a comment.
+
+
+GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
+[creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
+
+**Note:** Unless your change is small, **please** consider submitting it as multiple Pull Requests:
+
+* 1️⃣ The first PR should include the overall structure of the new component:
+  * Readme, configuration, interfaces or base classes, etc...
+  * This PR is usually trivial to review, so the size limit does not apply to it.
+* 2️⃣ The second PR should include the concrete implementation of the component. If the
+  size of this PR is larger than the recommended size, consider **splitting** ⚔️ it into
+  multiple PRs.
+* If there are multiple sub-components, then ideally each one should be implemented as
+  a **separate** pull request.
+* The last PR should include changes to **any user-facing documentation**, and should include
+  end-to-end tests if applicable. The component must be enabled
+  only after sufficient testing, and when there is enough confidence in the
+  stability and quality of the component.
+
+
+You can always reach out to `ankit@signoz.io` to understand more about the repo and product. We are very responsive over email and [Slack](https://signoz.io/slack).
+
+### Pointers:
+- If you find any **bugs** → please create an [**issue.**](https://github.com/SigNoz/signoz/issues/new?assignees=&labels=&template=bug_report.md&title=)
+- If you find anything **missing** in documentation → you can create an issue with the label **`documentation`**.
+- If you want to build any **new feature** → please create an [issue with the label **`enhancement`**.](https://github.com/SigNoz/signoz/issues/new?assignees=&labels=&template=feature_request.md&title=)
+- If you want to **discuss** something about the product, start a new [**discussion**.](https://github.com/SigNoz/signoz/discussions)
+
+
+### Conventions to follow when submitting Commits and Pull Request(s)
+
+We try to follow [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/); more specifically, commits and PRs **should have type specifiers** prefixed in the name. [This](https://www.conventionalcommits.org/en/v1.0.0/#specification) should give you a better idea.
+
+e.g. If you are submitting a fix for an issue in the frontend, the PR name should be prefixed with **`fix(FE):`**
+
+- Follow [GitHub Flow](https://guides.github.com/introduction/flow/) guidelines for your contribution flows.
+
+- Feel free to ping us on [`#contributing`](https://signoz-community.slack.com/archives/C01LWQ8KS7M) or [`#contributing-frontend`](https://signoz-community.slack.com/archives/C027134DM8B) on our Slack community if you need any help on this :)
+
+ **[`^top^`](#)**
+
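To make the convention concrete, here are a few example commit/PR titles in that format. Only the `fix(FE):` prefix above comes from this guide — the other types and scopes below are illustrative assumptions, not an official list:

```
fix(FE): handle empty series in the service overview chart
feat(QS): support error counts in the errors API
chore: bump signoz/otelcontribcol image tag
docs: clarify the local ClickHouse setup steps
```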
+
+# 2. How to Contribute 🙋🏻‍♂️
+
+#### There are primarily 2 areas in which you can contribute to SigNoz
+
+- [**Frontend**](#3-develop-frontend-) (Written in TypeScript, React)
+- [**Backend**](#4-contribute-to-backend-query-service-) (Query Service, written in Go)
+
+Depending upon your area of expertise & interest, you can choose one or more areas to contribute to. Below are detailed instructions to contribute in each area.
+
+**Please note:** If you want to work on an issue, please ask the maintainers to assign the issue to you before starting work on it. This would help us understand who is working on an issue and prevent duplicate work. 🙏🏻
+
+⚠️ If you just raise a PR without the corresponding issue being assigned to you, it may not be accepted.
+
+ **[`^top^`](#)**
+
+
+# 3. Develop Frontend 🌚
+
+**Need to Update: [https://github.com/SigNoz/signoz/tree/develop/frontend](https://github.com/SigNoz/signoz/tree/develop/frontend)**
+
+Also, have a look at the [Frontend README.md](https://github.com/SigNoz/signoz/blob/develop/frontend/README.md) sections for more info on how to set up the SigNoz frontend locally (with and without Docker).
+
+## 3.1 Contribute to Frontend with Docker installation of SigNoz
+
+- Clone the SigNoz repository and cd into the signoz directory,
+  ```
+  git clone https://github.com/SigNoz/signoz.git && cd signoz
+  ```
+- Comment out the `frontend` service section at [`deploy/docker/clickhouse-setup/docker-compose.yaml#L68`](https://github.com/SigNoz/signoz/blob/develop/deploy/docker/clickhouse-setup/docker-compose.yaml#L68)
+
+![develop-frontend](https://user-images.githubusercontent.com/52788043/179009217-6692616b-17dc-4d27-b587-9d007098d739.jpeg)
+
+- Run `cd deploy` to move to the deploy directory,
+- Install SigNoz locally **without** the frontend,
+  - Add / uncomment the below configuration in the query-service section at [`deploy/docker/clickhouse-setup/docker-compose.yaml#L47`](https://github.com/SigNoz/signoz/blob/develop/deploy/docker/clickhouse-setup/docker-compose.yaml#L47)
+    ```
+    ports:
+      - "8080:8080"
+    ```
-    - If you are using x86_64 processors (All Intel/AMD processors) run `sudo docker-compose -f docker/clickhouse-setup/docker-compose.yaml up -d`
-    - If you are on arm64 processors (Apple M1 Macbooks) run `sudo docker-compose -f docker/clickhouse-setup/docker-compose.arm.yaml up -d`
-- `cd ../frontend` and change baseURL to `http://localhost:8080` in file `src/constants/env.ts`
-- `yarn install`
-- `yarn dev`
+
+  - Next run,
+    ```
+    sudo docker-compose -f docker/clickhouse-setup/docker-compose.yaml up -d
+    ```
+- `cd ../frontend`. The `baseURL` in [`frontend/src/constants/env.ts#L2`](https://github.com/SigNoz/signoz/blob/develop/frontend/src/constants/env.ts#L2) is driven by the `FRONTEND_API_ENDPOINT` environment variable, so create a `.env` file in the `frontend` directory with `FRONTEND_API_ENDPOINT` matching your configuration.
-
-> Notes for Maintainers/Contributors who will change Line Numbers of Frontend & Query-Section. Please Update Line Numbers in `./scripts/commentLinesForSetup.sh`
+
+  If you have the backend API exposed via the frontend nginx:
+  ```
+  FRONTEND_API_ENDPOINT=http://localhost:3301
+  ```
+  If not:
+  ```
+  FRONTEND_API_ENDPOINT=http://localhost:8080
+  ```
-
-### Contribute to Frontend without installing SigNoz backend
+
+- Next,
+  ```
+  yarn install
+  yarn dev
+  ```
-
-If you don't want to install SigNoz backend just for doing frontend development, we can provide you with test environments which you can use as the backend.
-Please ping us in #contributing channel in our [slack community](https://signoz.io/slack) and we will DM you with `<test environment URL>`
+
+### Important Notes:
+Maintainers / contributors who change line numbers of `Frontend` & `Query-Service`, please update them in [`/.scripts/commentLinesForSetup.sh`](https://github.com/SigNoz/signoz/blob/develop/.scripts/commentLinesForSetup.sh)
-
-- `git clone https://github.com/SigNoz/signoz.git && cd signoz/frontend`
-- Create a file `.env` with `FRONTEND_API_ENDPOINT=<test environment URL>`
-- `yarn install`
-- `yarn dev`
+
+ **[`^top^`](#)**
-
-**_Frontend should now be accessible at `http://localhost:3301/application`_**
+
+## 3.2 Contribute to Frontend without installing SigNoz backend
-
-# Contribute to Query-Service
+
+If you don't want to install the SigNoz backend just for doing frontend development, we can provide you with test environments that you can use as the backend.
-
-Need to update [https://github.com/SigNoz/signoz/tree/main/pkg/query-service](https://github.com/SigNoz/signoz/tree/main/pkg/query-service)
+
+- Clone the SigNoz repository and cd into the signoz/frontend directory,
+  ```
+  git clone https://github.com/SigNoz/signoz.git && cd signoz/frontend
+  ```
-
-### To run ClickHouse setup (recommended for local development)
+
+- Create a file `.env` in the `frontend` directory with `FRONTEND_API_ENDPOINT=<test environment URL>`
+- Next,
+  ```
+  yarn install
+  yarn dev
+  ```
-
-- git clone https://github.com/SigNoz/signoz.git
-- run `cd signoz` to move to signoz directory
-- run `sudo make dev-setup` to configure local setup to run query-service
-- comment out frontend service section at `docker/clickhouse-setup/docker-compose.yaml`
-- comment out query-service section at `docker/clickhouse-setup/docker-compose.yaml`
-- add below configuration to clickhouse section at `docker/clickhouse-setup/docker-compose.yaml`
-```docker
-  expose:
-    - 9000
-  ports:
-    - 9001:9000
-```
+
+Please ping us in the [`#contributing`](https://signoz-community.slack.com/archives/C01LWQ8KS7M) channel or ask `@Prashant Shahi` in our [Slack Community](https://signoz.io/slack) and we will DM you with `<test environment URL>`.
+
+**Frontend should now be accessible at** [`http://localhost:3301/application`](http://localhost:3301/application)
+
+ **[`^top^`](#)**
+
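For orientation on the `.env` flow above: the frontend folds `FRONTEND_API_ENDPOINT` into its `baseURL` constant at build time. A minimal TypeScript sketch of what `frontend/src/constants/env.ts` amounts to — illustrative only, and the fallback URL here is an assumption, so verify against the file in your checkout:

```typescript
// Sketch of frontend/src/constants/env.ts (illustrative, not verbatim).
// The bundler substitutes process.env values at build time, which is why a
// frontend/.env file with FRONTEND_API_ENDPOINT is enough to repoint the API.
const ENVIRONMENT = {
	baseURL: process.env.FRONTEND_API_ENDPOINT ?? 'http://localhost:3301', // fallback is an assumption
};

export default ENVIRONMENT;
```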
+
+# 4. Contribute to Backend (Query-Service) 🌑
+
+**Need to Update:** [**https://github.com/SigNoz/signoz/tree/develop/pkg/query-service**](https://github.com/SigNoz/signoz/tree/develop/pkg/query-service)
+
+## 4.1 To run ClickHouse setup (recommended for local development)
+
+- Clone the SigNoz repository and cd into the signoz directory,
+  ```
+  git clone https://github.com/SigNoz/signoz.git && cd signoz
+  ```
+- Run `sudo make dev-setup` to configure the local setup to run query-service,
+- Comment out the `frontend` service section at [`deploy/docker/clickhouse-setup/docker-compose.yaml#L68`](https://github.com/SigNoz/signoz/blob/develop/deploy/docker/clickhouse-setup/docker-compose.yaml#L68),
+- Comment out the `query-service` section at [`deploy/docker/clickhouse-setup/docker-compose.yaml#L41`](https://github.com/SigNoz/signoz/blob/develop/deploy/docker/clickhouse-setup/docker-compose.yaml#L41),
+- Add the below configuration to the `clickhouse` section at [`deploy/docker/clickhouse-setup/docker-compose.yaml`](https://github.com/SigNoz/signoz/blob/develop/deploy/docker/clickhouse-setup/docker-compose.yaml),
+  ```
+  ports:
+    - 9001:9000
+  ```
+- Run `cd pkg/query-service/` to move to the `query-service` directory,
+- Then create a `.env` file with the following environment variable, which points the `RELATIONAL_DATASOURCE_PATH` mentioned in [`./constants/constants.go#L38`](https://github.com/SigNoz/signoz/blob/develop/pkg/query-service/constants/constants.go#L38) at a local path:
+  ```
+  SIGNOZ_LOCAL_DB_PATH="./signoz.db"
+  ```
+- Now, install SigNoz locally **without** the `frontend` and `query-service` by running `sudo make run-x86` (this change removes the separate `run-arm` Makefile target, so the same compose file is used on both `x86_64` and `arm64`/Apple M1 machines)
+
+#### Run locally
+```
+ClickHouseUrl=tcp://localhost:9001 STORAGE=clickhouse go run main.go
+```
-
-- run `cd pkg/query-service/` to move to query-service directory
-- Open ./constants/constants.go
-    - Replace ```const RELATIONAL_DATASOURCE_PATH = "/var/lib/signoz/signoz.db"``` \
      with ```const RELATIONAL_DATASOURCE_PATH = "./signoz.db".```
-
-- Install signoz locally without the frontend and query-service
-    - If you are using x86_64 processors (All Intel/AMD processors) run `sudo make run-x86`
-    - If you are on arm64 processors (Apple M1 Macbooks) run `sudo make run-arm`
-
-#### Run locally
-```console
-ClickHouseUrl=tcp://localhost:9001 STORAGE=clickhouse go run main.go
-```
-
-> Notes for Maintainers/Contributors who will change Line Numbers of Frontend & Query-Section. Please Update Line Numbers in `./scripts/commentLinesForSetup.sh`
+
+#### Build and Run locally
+```
+cd pkg/query-service
+go build -o build/query-service main.go
+ClickHouseUrl=tcp://localhost:9001 STORAGE=clickhouse build/query-service
+```
+
+#### Docker Images
+The docker images of query-service are available at https://hub.docker.com/r/signoz/query-service
+
+```
+docker pull signoz/query-service
+```
+
+```
+docker pull signoz/query-service:latest
+```
+
+```
+docker pull signoz/query-service:develop
+```
+
+### Important Note:
+Maintainers / contributors who change line numbers of `Frontend` & `Query-Service`, please update them in [`/.scripts/commentLinesForSetup.sh`](https://github.com/SigNoz/signoz/blob/develop/.scripts/commentLinesForSetup.sh)
-
-**_Query Service should now be available at `http://localhost:8080`_**
-
-> If you want to see how, frontend plays with query service, you can run frontend also in you local env with the baseURL changed to `http://localhost:8080` in file `src/constants/env.ts` as the query-service is now running at port `8080`
+
+---
+
+**Query Service should now be available at** [`http://localhost:8080`](http://localhost:8080)
+
+If you want to see how the frontend plays with query service, you can also run the frontend in your local env with the baseURL changed to `http://localhost:8080` in file [`frontend/src/constants/env.ts`](https://github.com/SigNoz/signoz/blob/develop/frontend/src/constants/env.ts) as the `query-service` is now running at port `8080`.
+
+> To use it on your forked repo, edit the 'Open in Gitpod' button URL to `https://gitpod.io/#https://github.com/<your-username>/signoz`
+
+ **[`^top^`](#)**
+
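Once the query service is running, a quick smoke test from the frontend's perspective can save debugging time. A minimal TypeScript sketch (Node 18+ for the global `fetch`); the `/api/v1/version` route is an assumption based on the frontend's `getVersion` call, so adjust the path if your build differs:

```typescript
// smoke-test.ts — checks that the local query-service answers on port 8080.
// Run with: npx ts-node smoke-test.ts
const BASE_URL = 'http://localhost:8080';

async function main(): Promise<void> {
	// Assumed route: the frontend's getVersion call suggests /api/v1/version.
	const response = await fetch(`${BASE_URL}/api/v1/version`);
	if (!response.ok) {
		throw new Error(`query-service responded with HTTP ${response.status}`);
	}
	console.log('query-service is up:', await response.text());
}

main().catch((error) => {
	console.error('smoke test failed:', error);
	process.exit(1);
});
```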
-
-# Contribute to SigNoz Helm Chart
+
+# 5. Contribute to SigNoz Helm Chart 📊
+
+**Need to Update: [https://github.com/SigNoz/charts](https://github.com/SigNoz/charts).**
-
-Need to update [https://github.com/SigNoz/charts](https://github.com/SigNoz/charts).
-
-### To run helm chart for local development
+
+## 5.1 To run helm chart for local development
+
+- Clone the SigNoz charts repository and cd into the charts directory,
+  ```
+  git clone https://github.com/SigNoz/charts.git && cd charts
+  ```
-
-- run `git clone https://github.com/SigNoz/charts.git` followed by `cd charts`
-- it is recommended to use lightweight kubernetes (k8s) cluster for local development:
+- It is recommended to use a lightweight Kubernetes (k8s) cluster for local development:
   - [kind](https://kind.sigs.k8s.io/docs/user/quick-start/#installation)
   - [k3d](https://k3d.io/#installation)
   - [minikube](https://minikube.sigs.k8s.io/docs/start/)
-- create a k8s cluster and make sure `kubectl` points to the locally created k8s cluster
-- run `make dev-install` to install SigNoz chart with `my-release` release name in `platform` namespace.
-- run `kubectl -n platform port-forward svc/my-release-signoz-frontend 3301:3301` to make SigNoz UI available at [localhost:3301](http://localhost:3301)
+- Create a k8s cluster and make sure `kubectl` points to the locally created k8s cluster,
+- Run `make dev-install` to install the SigNoz chart with the `my-release` release name in the `platform` namespace,
+- Next run,
+  ```
+  kubectl -n platform port-forward svc/my-release-signoz-frontend 3301:3301
+  ```
+  to make the SigNoz UI available at [localhost:3301](http://localhost:3301)
 
-**To install HotROD sample app:**
+**5.1.1 To install the HotROD sample app:**
 
 ```bash
 curl -sL https://github.com/SigNoz/signoz/raw/main/sample-apps/hotrod/hotrod-install.sh \
   | HELM_RELEASE=my-release SIGNOZ_NAMESPACE=platform bash
 ```
 
-**To load data with HotROD sample app:**
+**5.1.2 To load data with the HotROD sample app:**
 
 ```bash
 kubectl -n sample-application run strzal --image=djbingham/curl \
@@ -124,7 +333,7 @@ kubectl -n sample-application run strzal --image=djbingham/curl \
  'locust_count=6' -F 'hatch_rate=2' http://locust-master:8089/swarm
 ```
 
-**To stop the load generation:**
+**5.1.3 To stop the load generation:**
 
 ```bash
 kubectl -n sample-application run strzal --image=djbingham/curl \
@@ -132,59 +341,32 @@ kubectl -n sample-application run strzal --image=djbingham/curl \
  http://locust-master:8089/stop
 ```
 
-**To delete HotROD sample app:**
+**5.1.4 To delete the HotROD sample app:**
 
 ```bash
 curl -sL https://github.com/SigNoz/signoz/raw/main/sample-apps/hotrod/hotrod-delete.sh \
   | HOTROD_NAMESPACE=sample-application bash
 ```
 
+ **[`^top^`](#)**
+
 ---
 
-## General Instructions
+## Other Ways to Contribute
 
-**Before making any significant changes, please open an issue**. Each issue
-should describe the following:
+There are many other ways to get involved with the community and to participate in this project:
 
-* Requirement - what kind of use case are you trying to solve?
-* Proposal - what do you suggest to solve the problem or improve the existing
-  situation?
-* Any open questions to address
-
-Discussing your proposed changes ahead of time will make the contribution
-process smooth for everyone. Once the approach is agreed upon, make your changes
-and open a pull request(s). Unless your change is small, Please consider submitting different PRs:
-
-* First PR should include the overall structure of the new component:
-  * Readme, configuration, interfaces or base classes etc...
-  * This PR is usually trivial to review, so the size limit does not apply to
-    it.
-* Second PR should include the concrete implementation of the component. If the
-  size of this PR is larger than the recommended size consider splitting it in
-  multiple PRs.
-* If there are multiple sub-component then ideally each one should be implemented as
-  a separate pull request.
-* Last PR should include changes to any user facing documentation. And should include
-  end to end tests if applicable. The component must be enabled
-  only after sufficient testing, and there is enough confidence in the
-  stability and quality of the component.
+- Use the product, submitting GitHub issues when a problem is found.
+- Help code review pull requests and participate in issue threads.
+- Submit a new feature request as an issue.
+- Help answer questions on forums such as Stack Overflow and the [SigNoz Community Slack Channel](https://signoz.io/slack).
+- Tell others about the project on Twitter, your blog, etc.
 
-You can always reach out to `ankit@signoz.io` to understand more about the repo and product. We are very responsive over email and [slack](https://signoz.io/slack).
+## License
 
-- If you find any bugs, please create an issue
-- If you find anything missing in documentation, you can create an issue with label **documentation**
-- If you want to build any new feature, please create an issue with label `enhancement`
-- If you want to discuss something about the product, start a new [discussion](https://github.com/SigNoz/signoz/discussions)
+By contributing to SigNoz, you agree that your contributions will be licensed under its MIT license.
 
-### Conventions to follow when submitting commits, PRs
+Again, feel free to ping us on [`#contributing`](https://signoz-community.slack.com/archives/C01LWQ8KS7M) or [`#contributing-frontend`](https://signoz-community.slack.com/archives/C027134DM8B) on our Slack community if you need any help on this :)
 
-1. We try to follow https://www.conventionalcommits.org/en/v1.0.0/
-
-More specifically the commits and PRs should have type specifiers prefixed in the name. [This](https://www.conventionalcommits.org/en/v1.0.0/#specification) should give you a better idea.
-
-e.g. If you are submitting a fix for an issue in frontend - PR name should be prefixed with `fix(FE):`
-
-2. Follow [GitHub Flow](https://guides.github.com/introduction/flow/) guidelines for your contribution flows
-
-3. Feel free to ping us on `#contributing` or `#contributing-frontend` on our slack community if you need any help on this :)
+Thank You!
diff --git a/Makefile b/Makefile
index ac93167fa7..7aaa3a41d6 100644
--- a/Makefile
+++ b/Makefile
@@ -82,15 +82,9 @@ dev-setup:
 run-x86:
 	@docker-compose -f $(STANDALONE_DIRECTORY)/docker-compose.yaml up -d
 
-run-arm:
-	@docker-compose -f $(STANDALONE_DIRECTORY)/docker-compose.arm.yaml up -d
-
 down-x86:
 	@docker-compose -f $(STANDALONE_DIRECTORY)/docker-compose.yaml down -v
 
-down-arm:
-	@docker-compose -f $(STANDALONE_DIRECTORY)/docker-compose.arm.yaml down -v
-
 clear-standalone-data:
 	@docker run --rm -v "$(PWD)/$(STANDALONE_DIRECTORY)/data:/pwd" busybox \
 	sh -c "cd /pwd && rm -rf alertmanager/* clickhouse/* signoz/*"
diff --git a/deploy/docker-swarm/clickhouse-setup/clickhouse-config.xml b/deploy/docker-swarm/clickhouse-setup/clickhouse-config.xml
index 3bb26a3a36..4a6a82b8af 100644
--- a/deploy/docker-swarm/clickhouse-setup/clickhouse-config.xml
+++ b/deploy/docker-swarm/clickhouse-setup/clickhouse-config.xml
@@ -22,7 +22,7 @@
       [1]: https://github.com/pocoproject/poco/blob/poco-1.9.4-release/Foundation/include/Poco/Logger.h#L105-L114
     -->
-        <level>trace</level>
+        <level>information</level>
         <log>/var/log/clickhouse-server/clickhouse-server.log</log>
         <errorlog>/var/log/clickhouse-server/clickhouse-server.err.log</errorlog>

[A second hunk follows whose `diff --git` header was lost in extraction (apparently `deploy/docker/clickhouse-setup/clickhouse-config.xml`). It applies the same `<level>trace</level>` to `<level>information</level>` change with identical log/errorlog context, and then removes the rest of the previously bundled default ClickHouse server config, whose XML markup was also stripped in extraction. Recoverable from the remnants: logger rotation (size 1000M, count 10); default ports (8123 HTTP, 9000 native, 9004 MySQL, 9005 PostgreSQL, 9009 interserver); OpenSSL server/client sections; connection limits (max_connections 4096) and memory/cache settings; data paths under /var/lib/clickhouse/; users.xml-based access control; sample remote_servers cluster layouts over localhost:9000; a ZooKeeper-backed distributed DDL queue at /clickhouse/task_queue/ddl; system log tables (query_log, trace_log, query_thread_log, query_views_log, part_log, metric_log, asynchronous_metric_log, opentelemetry_span_log, crash_log, processors_profile_log) partitioned by toYYYYMM(event_date) with 7500 ms flush intervals; *_dictionary.xml and *_function.xml include patterns; graphite rollup examples (click_cost, max); a format_schemas path; query-masking rules for encrypt/decrypt arguments; a Sentry ingest endpoint; and notes on installing clickhouse-jdbc-bridge for JDBC table functions (see https://github.com/ClickHouse/clickhouse-jdbc-bridge#usage).]
diff --git a/deploy/docker/clickhouse-setup/docker-compose.yaml b/deploy/docker/clickhouse-setup/docker-compose.yaml index f8c2954446..3b3403a480 100644 --- a/deploy/docker/clickhouse-setup/docker-compose.yaml +++ b/deploy/docker/clickhouse-setup/docker-compose.yaml @@ -39,7 +39,7 @@ services: # Notes for Maintainers/Contributors who will change Line Numbers of Frontend & Query-Section. Please Update Line Numbers in `./scripts/commentLinesForSetup.sh` & `./CONTRIBUTING.md` query-service: - image: signoz/query-service:0.9.2 + image: signoz/query-service:0.10.0 container_name: query-service command: ["-config=/root/config/prometheus.yml"] # ports: @@ -66,7 +66,7 @@ services: condition: service_healthy frontend: - image: signoz/frontend:0.9.2 + image: signoz/frontend:0.10.0 container_name: frontend restart: on-failure depends_on: @@ -78,20 +78,24 @@ services: - ../common/nginx-config.conf:/etc/nginx/conf.d/default.conf otel-collector: - image: signoz/otelcontribcol:0.45.1-1.0 + image: signoz/otelcontribcol:0.45.1-1.1 command: ["--config=/etc/otel-collector-config.yaml"] volumes: - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml + environment: + - OTEL_RESOURCE_ATTRIBUTES=host.name=signoz-host,os.type=linux ports: + # - "1777:1777" # pprof extension - "4317:4317" # OTLP gRPC receiver - "4318:4318" # OTLP HTTP receiver - # - "8889:8889" # Prometheus metrics exposed by the agent - # - "13133:13133" # health_check - # - "14268:14268" # Jaeger receiver + # - "8888:8888" # OtelCollector internal metrics + # - "8889:8889" # signoz spanmetrics exposed by the agent + # - "9411:9411" # Zipkin port + # - "13133:13133" # health check extension + # - "14250:14250" # Jaeger gRPC + # - "14268:14268" # Jaeger thrift HTTP # - "55678:55678" # OpenCensus receiver - # - "55679:55679" # zpages extension - # - "55680:55680" # OTLP gRPC legacy receiver - # - "55681:55681" # OTLP HTTP legacy receiver + # - "55679:55679" # zPages extension mem_limit: 2000m restart: on-failure depends_on: @@ -99,10 +103,15 @@ services: condition: service_healthy otel-collector-metrics: - image: signoz/otelcontribcol:0.45.1-1.0 + image: signoz/otelcontribcol:0.45.1-1.1 command: ["--config=/etc/otel-collector-metrics-config.yaml"] volumes: - ./otel-collector-metrics-config.yaml:/etc/otel-collector-metrics-config.yaml + # ports: + # - "1777:1777" # pprof extension + # - "8888:8888" # OtelCollector internal metrics + # - "13133:13133" # Health check extension + # - "55679:55679" # zPages extension restart: on-failure depends_on: clickhouse: diff --git a/deploy/docker/clickhouse-setup/otel-collector-config.yaml b/deploy/docker/clickhouse-setup/otel-collector-config.yaml index e363f015df..0717cf4c45 100644 --- a/deploy/docker/clickhouse-setup/otel-collector-config.yaml +++ b/deploy/docker/clickhouse-setup/otel-collector-config.yaml @@ -1,25 +1,36 @@ receivers: + opencensus: + endpoint: 0.0.0.0:55678 otlp/spanmetrics: protocols: grpc: - endpoint: "localhost:12345" + endpoint: localhost:12345 otlp: protocols: grpc: + endpoint: 0.0.0.0:4317 http: + endpoint: 0.0.0.0:4318 jaeger: protocols: grpc: + endpoint: 0.0.0.0:14250 thrift_http: + endpoint: 0.0.0.0:14268 + # thrift_compact: + # endpoint: 0.0.0.0:6831 + # thrift_binary: + # endpoint: 0.0.0.0:6832 hostmetrics: collection_interval: 60s scrapers: - cpu: - load: - memory: - disk: - filesystem: - network: + cpu: {} + load: {} + memory: {} + disk: {} + filesystem: {} + network: {} + processors: batch: send_batch_size: 10000 @@ -49,9 +60,20 @@ processors: # num_workers: 4 # 
queue_size: 100
#    retry_on_failure: true
+  resourcedetection:
+    # Using OTEL_RESOURCE_ATTRIBUTES envvar, env detector adds custom labels.
+    detectors: [env, system] # include ec2 for AWS, gce for GCP and azure for Azure.
+    timeout: 2s
+    override: false
+
 extensions:
-  health_check: {}
-  zpages: {}
+  health_check:
+    endpoint: 0.0.0.0:13133
+  zpages:
+    endpoint: 0.0.0.0:55679
+  pprof:
+    endpoint: 0.0.0.0:1777
+
 exporters:
   clickhousetraces:
     datasource: tcp://clickhouse:9000/?database=signoz_traces
@@ -60,18 +82,30 @@
     resource_to_telemetry_conversion:
       enabled: true
   prometheus:
-    endpoint: "0.0.0.0:8889"
+    endpoint: 0.0.0.0:8889
+  # logging: {}
+
 service:
-  extensions: [health_check, zpages]
+  telemetry:
+    metrics:
+      address: 0.0.0.0:8888
+  extensions:
+    - health_check
+    - zpages
+    - pprof
   pipelines:
     traces:
       receivers: [jaeger, otlp]
      processors: [signozspanmetrics/prometheus, batch]
       exporters: [clickhousetraces]
     metrics:
-      receivers: [otlp, hostmetrics]
+      receivers: [otlp]
       processors: [batch]
       exporters: [clickhousemetricswrite]
+    metrics/hostmetrics:
+      receivers: [hostmetrics]
+      processors: [resourcedetection, batch]
+      exporters: [clickhousemetricswrite]
     metrics/spanmetrics:
       receivers: [otlp/spanmetrics]
       exporters: [prometheus]
diff --git a/deploy/docker/clickhouse-setup/otel-collector-metrics-config.yaml b/deploy/docker/clickhouse-setup/otel-collector-metrics-config.yaml
index 26c629ba60..fdc5830f57 100644
--- a/deploy/docker/clickhouse-setup/otel-collector-metrics-config.yaml
+++ b/deploy/docker/clickhouse-setup/otel-collector-metrics-config.yaml
@@ -3,15 +3,28 @@ receivers:
     protocols:
       grpc:
       http:
-
-  # Data sources: metrics
   prometheus:
     config:
       scrape_configs:
+        # otel-collector internal metrics
         - job_name: "otel-collector"
           scrape_interval: 60s
           static_configs:
-            - targets: ["otel-collector:8889"]
+            - targets:
+                - otel-collector:8888
+        # otel-collector-metrics internal metrics
+        - job_name: "otel-collector-metrics"
+          scrape_interval: 60s
+          static_configs:
+            - targets:
+                - localhost:8888
+        # SigNoz span metrics
+        - job_name: "signozspanmetrics-collector"
+          scrape_interval: 60s
+          static_configs:
+            - targets:
+                - otel-collector:8889
+
 processors:
   batch:
     send_batch_size: 10000
@@ -32,17 +45,29 @@ processors:
 #    num_workers: 4
 #    queue_size: 100
 #    retry_on_failure: true
+
 extensions:
-  health_check: {}
-  zpages: {}
+  health_check:
+    endpoint: 0.0.0.0:13133
+  zpages:
+    endpoint: 0.0.0.0:55679
+  pprof:
+    endpoint: 0.0.0.0:1777
+
 exporters:
   clickhousemetricswrite:
     endpoint: tcp://clickhouse:9000/?database=signoz_metrics
 
 service:
-  extensions: [health_check, zpages]
+  telemetry:
+    metrics:
+      address: 0.0.0.0:8888
+  extensions:
+    - health_check
+    - zpages
+    - pprof
   pipelines:
     metrics:
-      receivers: [otlp, prometheus]
+      receivers: [prometheus]
       processors: [batch]
       exporters: [clickhousemetricswrite]
diff --git a/deploy/docker/clickhouse-setup/users.xml b/deploy/docker/clickhouse-setup/users.xml
deleted file mode 100644
index f18562071d..0000000000
--- a/deploy/docker/clickhouse-setup/users.xml
+++ /dev/null
@@ -1,123 +0,0 @@
[The deleted users.xml (123 lines) lost its XML markup in extraction. Recoverable content: the default profile (max_memory_usage 10000000000, load_balancing random), a default user with open network access (::/0) assigned the default profile and quota, and a default quota with a 3600-second interval and all limits set to 0.]
diff --git a/frontend/package.json b/frontend/package.json
index f93bc9684c..868e95dce7 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -13,8 +13,9 @@
 		"jest:coverage": "jest --coverage",
 		"jest:watch": "jest --watch",
 		"postinstall": "is-ci || yarn husky:configure",
-		"playwright": "playwright test --config=./playwright.config.ts",
"playwright test --config=./playwright.config.ts", + "playwright": "NODE_ENV=testing playwright test --config=./playwright.config.ts", "playwright:local:debug": "PWDEBUG=console yarn playwright --headed --browser=chromium", + "playwright:codegen:local":"playwright codegen http://localhost:3301", "husky:configure": "cd .. && husky install frontend/.husky && cd frontend && chmod ug+x .husky/*", "commitlint": "commitlint --edit $1" }, @@ -43,6 +44,7 @@ "babel-preset-react-app": "^10.0.0", "chart.js": "^3.4.0", "chartjs-adapter-date-fns": "^2.0.0", + "chartjs-plugin-annotation": "^1.4.0", "color": "^4.2.1", "cross-env": "^7.0.3", "css-loader": "4.3.0", @@ -81,6 +83,7 @@ "style-loader": "1.3.0", "styled-components": "^5.2.1", "terser-webpack-plugin": "^5.2.5", + "timestamp-nano": "^1.0.0", "ts-node": "^10.2.1", "tsconfig-paths-webpack-plugin": "^3.5.1", "typescript": "^4.0.5", diff --git a/frontend/playwright.config.ts b/frontend/playwright.config.ts index 98fccbcb7f..6733c67536 100644 --- a/frontend/playwright.config.ts +++ b/frontend/playwright.config.ts @@ -16,6 +16,8 @@ const config: PlaywrightTestConfig = { updateSnapshots: 'all', fullyParallel: false, quiet: true, + testMatch: ['**/*.spec.ts'], + reporter: process.env.CI ? 'github' : 'list', }; export default config; diff --git a/frontend/public/locales/en-GB/alerts.json b/frontend/public/locales/en-GB/alerts.json new file mode 100644 index 0000000000..e67bd35273 --- /dev/null +++ b/frontend/public/locales/en-GB/alerts.json @@ -0,0 +1,85 @@ +{ + "preview_chart_unexpected_error": "An unexpeced error occurred updating the chart, please check your query.", + "preview_chart_threshold_label": "Threshold", + "placeholder_label_key_pair": "Click here to enter a label (key value pairs)", + "button_yes": "Yes", + "button_no": "No", + "remove_label_confirm": "This action will remove all the labels. Do you want to proceed?", + "remove_label_success": "Labels cleared", + "alert_form_step1": "Step 1 - Define the metric", + "alert_form_step2": "Step 2 - Define Alert Conditions", + "alert_form_step3": "Step 3 - Alert Configuration", + "metric_query_max_limit": "Can not create query. You can create maximum of 5 queries", + "confirm_save_title": "Save Changes", + "confirm_save_content_part1": "Your alert built with", + "confirm_save_content_part2": "query will be saved. Press OK to confirm.", + "unexpected_error": "Sorry, an unexpected error occurred. 
Please contact your admin", + "rule_created": "Rule created successfully", + "rule_edited": "Rule edited successfully", + "expression_missing": "expression is missing in {{where}}", + "metricname_missing": "metric name is missing in {{where}}", + "condition_required": "at least one metric condition is required", + "alertname_required": "alert name is required", + "promql_required": "promql expression is required when query format is set to PromQL", + "button_savechanges": "Save Rule", + "button_createrule": "Create Rule", + "button_returntorules": "Return to rules", + "button_cancelchanges": "Cancel", + "button_discard": "Discard", + "text_condition1": "Send a notification when the metric is", + "text_condition2": "the threshold", + "text_condition3": "during the last", + "option_5min": "5 mins", + "option_10min": "10 mins", + "option_15min": "15 mins", + "option_60min": "60 mins", + "option_4hours": "4 hours", + "option_24hours": "24 hours", + "field_threshold": "Alert Threshold", + "option_allthetimes": "all the times", + "option_atleastonce": "at least once", + "option_onaverage": "on average", + "option_intotal": "in total", + "option_above": "above", + "option_below": "below", + "option_equal": "is equal to", + "option_notequal": "not equal to", + "button_query": "Query", + "button_formula": "Formula", + "tab_qb": "Query Builder", + "tab_promql": "PromQL", + "title_confirm": "Confirm", + "button_ok": "Yes", + "button_cancel": "No", + "field_promql_expr": "PromQL Expression", + "field_alert_name": "Alert Name", + "field_alert_desc": "Alert Description", + "field_labels": "Labels", + "field_severity": "Severity", + "option_critical": "Critical", + "option_error": "Error", + "option_warning": "Warning", + "option_info": "Info", + "user_guide_headline": "Steps to create an Alert", + "user_guide_qb_step1": "Step 1 - Define the metric", + "user_guide_qb_step1a": "Choose a metric which you want to create an alert on", + "user_guide_qb_step1b": "Filter it based on WHERE field or GROUPBY if needed", + "user_guide_qb_step1c": "Apply an aggregatiion function like COUNT, SUM, etc. 
or choose NOOP to plot the raw metric", + "user_guide_qb_step1d": "Create a formula based on Queries if needed", + "user_guide_qb_step2": "Step 2 - Define Alert Conditions", + "user_guide_qb_step2a": "Select the evaluation interval, threshold type and whether you want to alert above/below a value", + "user_guide_qb_step2b": "Enter the Alert threshold", + "user_guide_qb_step3": "Step 3 -Alert Configuration", + "user_guide_qb_step3a": "Set alert severity, name and descriptions", + "user_guide_qb_step3b": "Add tags to the alert in the Label field if needed", + "user_guide_pql_step1": "Step 1 - Define the metric", + "user_guide_pql_step1a": "Write a PromQL query for the metric", + "user_guide_pql_step1b": "Format the legends based on labels you want to highlight", + "user_guide_pql_step2": "Step 2 - Define Alert Conditions", + "user_guide_pql_step2a": "Select the threshold type and whether you want to alert above/below a value", + "user_guide_pql_step2b": "Enter the Alert threshold", + "user_guide_pql_step3": "Step 3 -Alert Configuration", + "user_guide_pql_step3a": "Set alert severity, name and descriptions", + "user_guide_pql_step3b": "Add tags to the alert in the Label field if needed", + "user_tooltip_more_help": "More details on how to create alerts" +} \ No newline at end of file diff --git a/frontend/public/locales/en-GB/rules.json b/frontend/public/locales/en-GB/rules.json new file mode 100644 index 0000000000..e67bd35273 --- /dev/null +++ b/frontend/public/locales/en-GB/rules.json @@ -0,0 +1,85 @@ +{ + "preview_chart_unexpected_error": "An unexpeced error occurred updating the chart, please check your query.", + "preview_chart_threshold_label": "Threshold", + "placeholder_label_key_pair": "Click here to enter a label (key value pairs)", + "button_yes": "Yes", + "button_no": "No", + "remove_label_confirm": "This action will remove all the labels. Do you want to proceed?", + "remove_label_success": "Labels cleared", + "alert_form_step1": "Step 1 - Define the metric", + "alert_form_step2": "Step 2 - Define Alert Conditions", + "alert_form_step3": "Step 3 - Alert Configuration", + "metric_query_max_limit": "Can not create query. You can create maximum of 5 queries", + "confirm_save_title": "Save Changes", + "confirm_save_content_part1": "Your alert built with", + "confirm_save_content_part2": "query will be saved. Press OK to confirm.", + "unexpected_error": "Sorry, an unexpected error occurred. 
Please contact your admin", + "rule_created": "Rule created successfully", + "rule_edited": "Rule edited successfully", + "expression_missing": "expression is missing in {{where}}", + "metricname_missing": "metric name is missing in {{where}}", + "condition_required": "at least one metric condition is required", + "alertname_required": "alert name is required", + "promql_required": "promql expression is required when query format is set to PromQL", + "button_savechanges": "Save Rule", + "button_createrule": "Create Rule", + "button_returntorules": "Return to rules", + "button_cancelchanges": "Cancel", + "button_discard": "Discard", + "text_condition1": "Send a notification when the metric is", + "text_condition2": "the threshold", + "text_condition3": "during the last", + "option_5min": "5 mins", + "option_10min": "10 mins", + "option_15min": "15 mins", + "option_60min": "60 mins", + "option_4hours": "4 hours", + "option_24hours": "24 hours", + "field_threshold": "Alert Threshold", + "option_allthetimes": "all the times", + "option_atleastonce": "at least once", + "option_onaverage": "on average", + "option_intotal": "in total", + "option_above": "above", + "option_below": "below", + "option_equal": "is equal to", + "option_notequal": "not equal to", + "button_query": "Query", + "button_formula": "Formula", + "tab_qb": "Query Builder", + "tab_promql": "PromQL", + "title_confirm": "Confirm", + "button_ok": "Yes", + "button_cancel": "No", + "field_promql_expr": "PromQL Expression", + "field_alert_name": "Alert Name", + "field_alert_desc": "Alert Description", + "field_labels": "Labels", + "field_severity": "Severity", + "option_critical": "Critical", + "option_error": "Error", + "option_warning": "Warning", + "option_info": "Info", + "user_guide_headline": "Steps to create an Alert", + "user_guide_qb_step1": "Step 1 - Define the metric", + "user_guide_qb_step1a": "Choose a metric which you want to create an alert on", + "user_guide_qb_step1b": "Filter it based on WHERE field or GROUPBY if needed", + "user_guide_qb_step1c": "Apply an aggregatiion function like COUNT, SUM, etc. 
or choose NOOP to plot the raw metric", + "user_guide_qb_step1d": "Create a formula based on Queries if needed", + "user_guide_qb_step2": "Step 2 - Define Alert Conditions", + "user_guide_qb_step2a": "Select the evaluation interval, threshold type and whether you want to alert above/below a value", + "user_guide_qb_step2b": "Enter the Alert threshold", + "user_guide_qb_step3": "Step 3 -Alert Configuration", + "user_guide_qb_step3a": "Set alert severity, name and descriptions", + "user_guide_qb_step3b": "Add tags to the alert in the Label field if needed", + "user_guide_pql_step1": "Step 1 - Define the metric", + "user_guide_pql_step1a": "Write a PromQL query for the metric", + "user_guide_pql_step1b": "Format the legends based on labels you want to highlight", + "user_guide_pql_step2": "Step 2 - Define Alert Conditions", + "user_guide_pql_step2a": "Select the threshold type and whether you want to alert above/below a value", + "user_guide_pql_step2b": "Enter the Alert threshold", + "user_guide_pql_step3": "Step 3 -Alert Configuration", + "user_guide_pql_step3a": "Set alert severity, name and descriptions", + "user_guide_pql_step3b": "Add tags to the alert in the Label field if needed", + "user_tooltip_more_help": "More details on how to create alerts" +} \ No newline at end of file diff --git a/frontend/public/locales/en/alerts.json b/frontend/public/locales/en/alerts.json new file mode 100644 index 0000000000..e67bd35273 --- /dev/null +++ b/frontend/public/locales/en/alerts.json @@ -0,0 +1,85 @@ +{ + "preview_chart_unexpected_error": "An unexpeced error occurred updating the chart, please check your query.", + "preview_chart_threshold_label": "Threshold", + "placeholder_label_key_pair": "Click here to enter a label (key value pairs)", + "button_yes": "Yes", + "button_no": "No", + "remove_label_confirm": "This action will remove all the labels. Do you want to proceed?", + "remove_label_success": "Labels cleared", + "alert_form_step1": "Step 1 - Define the metric", + "alert_form_step2": "Step 2 - Define Alert Conditions", + "alert_form_step3": "Step 3 - Alert Configuration", + "metric_query_max_limit": "Can not create query. You can create maximum of 5 queries", + "confirm_save_title": "Save Changes", + "confirm_save_content_part1": "Your alert built with", + "confirm_save_content_part2": "query will be saved. Press OK to confirm.", + "unexpected_error": "Sorry, an unexpected error occurred. 
Please contact your admin", + "rule_created": "Rule created successfully", + "rule_edited": "Rule edited successfully", + "expression_missing": "expression is missing in {{where}}", + "metricname_missing": "metric name is missing in {{where}}", + "condition_required": "at least one metric condition is required", + "alertname_required": "alert name is required", + "promql_required": "promql expression is required when query format is set to PromQL", + "button_savechanges": "Save Rule", + "button_createrule": "Create Rule", + "button_returntorules": "Return to rules", + "button_cancelchanges": "Cancel", + "button_discard": "Discard", + "text_condition1": "Send a notification when the metric is", + "text_condition2": "the threshold", + "text_condition3": "during the last", + "option_5min": "5 mins", + "option_10min": "10 mins", + "option_15min": "15 mins", + "option_60min": "60 mins", + "option_4hours": "4 hours", + "option_24hours": "24 hours", + "field_threshold": "Alert Threshold", + "option_allthetimes": "all the times", + "option_atleastonce": "at least once", + "option_onaverage": "on average", + "option_intotal": "in total", + "option_above": "above", + "option_below": "below", + "option_equal": "is equal to", + "option_notequal": "not equal to", + "button_query": "Query", + "button_formula": "Formula", + "tab_qb": "Query Builder", + "tab_promql": "PromQL", + "title_confirm": "Confirm", + "button_ok": "Yes", + "button_cancel": "No", + "field_promql_expr": "PromQL Expression", + "field_alert_name": "Alert Name", + "field_alert_desc": "Alert Description", + "field_labels": "Labels", + "field_severity": "Severity", + "option_critical": "Critical", + "option_error": "Error", + "option_warning": "Warning", + "option_info": "Info", + "user_guide_headline": "Steps to create an Alert", + "user_guide_qb_step1": "Step 1 - Define the metric", + "user_guide_qb_step1a": "Choose a metric which you want to create an alert on", + "user_guide_qb_step1b": "Filter it based on WHERE field or GROUPBY if needed", + "user_guide_qb_step1c": "Apply an aggregatiion function like COUNT, SUM, etc. 
or choose NOOP to plot the raw metric", + "user_guide_qb_step1d": "Create a formula based on Queries if needed", + "user_guide_qb_step2": "Step 2 - Define Alert Conditions", + "user_guide_qb_step2a": "Select the evaluation interval, threshold type and whether you want to alert above/below a value", + "user_guide_qb_step2b": "Enter the Alert threshold", + "user_guide_qb_step3": "Step 3 -Alert Configuration", + "user_guide_qb_step3a": "Set alert severity, name and descriptions", + "user_guide_qb_step3b": "Add tags to the alert in the Label field if needed", + "user_guide_pql_step1": "Step 1 - Define the metric", + "user_guide_pql_step1a": "Write a PromQL query for the metric", + "user_guide_pql_step1b": "Format the legends based on labels you want to highlight", + "user_guide_pql_step2": "Step 2 - Define Alert Conditions", + "user_guide_pql_step2a": "Select the threshold type and whether you want to alert above/below a value", + "user_guide_pql_step2b": "Enter the Alert threshold", + "user_guide_pql_step3": "Step 3 -Alert Configuration", + "user_guide_pql_step3a": "Set alert severity, name and descriptions", + "user_guide_pql_step3b": "Add tags to the alert in the Label field if needed", + "user_tooltip_more_help": "More details on how to create alerts" +} \ No newline at end of file diff --git a/frontend/public/locales/en/rules.json b/frontend/public/locales/en/rules.json new file mode 100644 index 0000000000..e67bd35273 --- /dev/null +++ b/frontend/public/locales/en/rules.json @@ -0,0 +1,85 @@ +{ + "preview_chart_unexpected_error": "An unexpeced error occurred updating the chart, please check your query.", + "preview_chart_threshold_label": "Threshold", + "placeholder_label_key_pair": "Click here to enter a label (key value pairs)", + "button_yes": "Yes", + "button_no": "No", + "remove_label_confirm": "This action will remove all the labels. Do you want to proceed?", + "remove_label_success": "Labels cleared", + "alert_form_step1": "Step 1 - Define the metric", + "alert_form_step2": "Step 2 - Define Alert Conditions", + "alert_form_step3": "Step 3 - Alert Configuration", + "metric_query_max_limit": "Can not create query. You can create maximum of 5 queries", + "confirm_save_title": "Save Changes", + "confirm_save_content_part1": "Your alert built with", + "confirm_save_content_part2": "query will be saved. Press OK to confirm.", + "unexpected_error": "Sorry, an unexpected error occurred. 
Please contact your admin", + "rule_created": "Rule created successfully", + "rule_edited": "Rule edited successfully", + "expression_missing": "expression is missing in {{where}}", + "metricname_missing": "metric name is missing in {{where}}", + "condition_required": "at least one metric condition is required", + "alertname_required": "alert name is required", + "promql_required": "promql expression is required when query format is set to PromQL", + "button_savechanges": "Save Rule", + "button_createrule": "Create Rule", + "button_returntorules": "Return to rules", + "button_cancelchanges": "Cancel", + "button_discard": "Discard", + "text_condition1": "Send a notification when the metric is", + "text_condition2": "the threshold", + "text_condition3": "during the last", + "option_5min": "5 mins", + "option_10min": "10 mins", + "option_15min": "15 mins", + "option_60min": "60 mins", + "option_4hours": "4 hours", + "option_24hours": "24 hours", + "field_threshold": "Alert Threshold", + "option_allthetimes": "all the times", + "option_atleastonce": "at least once", + "option_onaverage": "on average", + "option_intotal": "in total", + "option_above": "above", + "option_below": "below", + "option_equal": "is equal to", + "option_notequal": "not equal to", + "button_query": "Query", + "button_formula": "Formula", + "tab_qb": "Query Builder", + "tab_promql": "PromQL", + "title_confirm": "Confirm", + "button_ok": "Yes", + "button_cancel": "No", + "field_promql_expr": "PromQL Expression", + "field_alert_name": "Alert Name", + "field_alert_desc": "Alert Description", + "field_labels": "Labels", + "field_severity": "Severity", + "option_critical": "Critical", + "option_error": "Error", + "option_warning": "Warning", + "option_info": "Info", + "user_guide_headline": "Steps to create an Alert", + "user_guide_qb_step1": "Step 1 - Define the metric", + "user_guide_qb_step1a": "Choose a metric which you want to create an alert on", + "user_guide_qb_step1b": "Filter it based on WHERE field or GROUPBY if needed", + "user_guide_qb_step1c": "Apply an aggregatiion function like COUNT, SUM, etc. 
or choose NOOP to plot the raw metric", + "user_guide_qb_step1d": "Create a formula based on Queries if needed", + "user_guide_qb_step2": "Step 2 - Define Alert Conditions", + "user_guide_qb_step2a": "Select the evaluation interval, threshold type and whether you want to alert above/below a value", + "user_guide_qb_step2b": "Enter the Alert threshold", + "user_guide_qb_step3": "Step 3 -Alert Configuration", + "user_guide_qb_step3a": "Set alert severity, name and descriptions", + "user_guide_qb_step3b": "Add tags to the alert in the Label field if needed", + "user_guide_pql_step1": "Step 1 - Define the metric", + "user_guide_pql_step1a": "Write a PromQL query for the metric", + "user_guide_pql_step1b": "Format the legends based on labels you want to highlight", + "user_guide_pql_step2": "Step 2 - Define Alert Conditions", + "user_guide_pql_step2a": "Select the threshold type and whether you want to alert above/below a value", + "user_guide_pql_step2b": "Enter the Alert threshold", + "user_guide_pql_step3": "Step 3 -Alert Configuration", + "user_guide_pql_step3a": "Set alert severity, name and descriptions", + "user_guide_pql_step3b": "Add tags to the alert in the Label field if needed", + "user_tooltip_more_help": "More details on how to create alerts" +} \ No newline at end of file diff --git a/frontend/src/api/alerts/create.ts b/frontend/src/api/alerts/create.ts index 10dbff99b6..cad7917815 100644 --- a/frontend/src/api/alerts/create.ts +++ b/frontend/src/api/alerts/create.ts @@ -9,7 +9,7 @@ const create = async ( ): Promise | ErrorResponse> => { try { const response = await axios.post('/rules', { - data: props.query, + ...props.data, }); return { diff --git a/frontend/src/api/alerts/get.ts b/frontend/src/api/alerts/get.ts index aeddf67fd0..0437f8d1d8 100644 --- a/frontend/src/api/alerts/get.ts +++ b/frontend/src/api/alerts/get.ts @@ -14,7 +14,7 @@ const get = async ( statusCode: 200, error: null, message: response.data.status, - payload: response.data.data, + payload: response.data, }; } catch (error) { return ErrorResponseHandler(error as AxiosError); diff --git a/frontend/src/api/alerts/put.ts b/frontend/src/api/alerts/put.ts index 15d4c7c698..b8c34e96bd 100644 --- a/frontend/src/api/alerts/put.ts +++ b/frontend/src/api/alerts/put.ts @@ -2,14 +2,14 @@ import axios from 'api'; import { ErrorResponseHandler } from 'api/ErrorResponseHandler'; import { AxiosError } from 'axios'; import { ErrorResponse, SuccessResponse } from 'types/api'; -import { PayloadProps, Props } from 'types/api/alerts/put'; +import { PayloadProps, Props } from 'types/api/alerts/save'; const put = async ( props: Props, ): Promise | ErrorResponse> => { try { const response = await axios.put(`/rules/${props.id}`, { - data: props.data, + ...props.data, }); return { diff --git a/frontend/src/api/alerts/save.ts b/frontend/src/api/alerts/save.ts new file mode 100644 index 0000000000..229f0ae126 --- /dev/null +++ b/frontend/src/api/alerts/save.ts @@ -0,0 +1,17 @@ +import { ErrorResponse, SuccessResponse } from 'types/api'; +import { PayloadProps, Props } from 'types/api/alerts/save'; + +import create from './create'; +import put from './put'; + +const save = async ( + props: Props, +): Promise | ErrorResponse> => { + if (props.id && props.id > 0) { + return put({ ...props }); + } + + return create({ ...props }); +}; + +export default save; diff --git a/frontend/src/api/errors/getAll.ts b/frontend/src/api/errors/getAll.ts index dcd8aa8e73..7014e52a56 100644 --- a/frontend/src/api/errors/getAll.ts +++ 
b/frontend/src/api/errors/getAll.ts @@ -10,9 +10,8 @@ const getAll = async ( ): Promise | ErrorResponse> => { try { const response = await axios.get( - `/errors?${createQueryParams({ - start: props.start.toString(), - end: props.end.toString(), + `/listErrors?${createQueryParams({ + ...props, })}`, ); diff --git a/frontend/src/api/errors/getByErrorTypeAndService.ts b/frontend/src/api/errors/getByErrorTypeAndService.ts index 6a2c6964d9..c9a710fd72 100644 --- a/frontend/src/api/errors/getByErrorTypeAndService.ts +++ b/frontend/src/api/errors/getByErrorTypeAndService.ts @@ -10,11 +10,8 @@ const getByErrorType = async ( ): Promise | ErrorResponse> => { try { const response = await axios.get( - `/errorWithType?${createQueryParams({ - start: props.start.toString(), - end: props.end.toString(), - serviceName: props.serviceName, - errorType: props.errorType, + `/errorFromGroupID?${createQueryParams({ + ...props, })}`, ); diff --git a/frontend/src/api/errors/getById.ts b/frontend/src/api/errors/getById.ts index 3ab7c4aa60..ab0bae3f8a 100644 --- a/frontend/src/api/errors/getById.ts +++ b/frontend/src/api/errors/getById.ts @@ -3,17 +3,15 @@ import { ErrorResponseHandler } from 'api/ErrorResponseHandler'; import { AxiosError } from 'axios'; import createQueryParams from 'lib/createQueryParams'; import { ErrorResponse, SuccessResponse } from 'types/api'; -import { PayloadProps, Props } from 'types/api/errors/getById'; +import { PayloadProps, Props } from 'types/api/errors/getByErrorId'; const getById = async ( props: Props, ): Promise | ErrorResponse> => { try { const response = await axios.get( - `/errorWithId?${createQueryParams({ - start: props.start.toString(), - end: props.end.toString(), - errorId: props.errorId, + `/errorFromErrorID?${createQueryParams({ + ...props, })}`, ); diff --git a/frontend/src/api/errors/getErrorCounts.ts b/frontend/src/api/errors/getErrorCounts.ts new file mode 100644 index 0000000000..4992a6d391 --- /dev/null +++ b/frontend/src/api/errors/getErrorCounts.ts @@ -0,0 +1,29 @@ +import axios from 'api'; +import { ErrorResponseHandler } from 'api/ErrorResponseHandler'; +import { AxiosError } from 'axios'; +import createQueryParams from 'lib/createQueryParams'; +import { ErrorResponse, SuccessResponse } from 'types/api'; +import { PayloadProps, Props } from 'types/api/errors/getErrorCounts'; + +const getErrorCounts = async ( + props: Props, +): Promise | ErrorResponse> => { + try { + const response = await axios.get( + `/countErrors?${createQueryParams({ + ...props, + })}`, + ); + + return { + statusCode: 200, + error: null, + message: response.data.message, + payload: response.data, + }; + } catch (error) { + return ErrorResponseHandler(error as AxiosError); + } +}; + +export default getErrorCounts; diff --git a/frontend/src/api/errors/getNextPrevId.ts b/frontend/src/api/errors/getNextPrevId.ts new file mode 100644 index 0000000000..07798c548e --- /dev/null +++ b/frontend/src/api/errors/getNextPrevId.ts @@ -0,0 +1,29 @@ +import axios from 'api'; +import { ErrorResponseHandler } from 'api/ErrorResponseHandler'; +import { AxiosError } from 'axios'; +import createQueryParams from 'lib/createQueryParams'; +import { ErrorResponse, SuccessResponse } from 'types/api'; +import { PayloadProps, Props } from 'types/api/errors/getNextPrevId'; + +const getNextPrevId = async ( + props: Props, +): Promise | ErrorResponse> => { + try { + const response = await axios.get( + `/nextPrevErrorIDs?${createQueryParams({ + ...props, + })}`, + ); + + return { + statusCode: 200, + error: null, + message: response.data.message, + payload: response.data, + }; + } catch (error) { + return ErrorResponseHandler(error as AxiosError); + } +}; + +export default getNextPrevId;
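All five error endpoints above follow the same shape: serialize `props` with `createQueryParams`, issue an axios GET, and wrap the result in the `SuccessResponse`/`ErrorResponse` envelope. A generic helper could capture that pattern; this is an editorial sketch only (the `getWithParams` name and the `Record<string, string | number>` constraint are assumptions, not part of this PR):

```typescript
import axios from 'api';
import { ErrorResponseHandler } from 'api/ErrorResponseHandler';
import { AxiosError } from 'axios';
import createQueryParams from 'lib/createQueryParams';
import { ErrorResponse, SuccessResponse } from 'types/api';

// Hypothetical shared fetcher; Payload is the per-endpoint response type.
const getWithParams = async <Props extends Record<string, string | number>, Payload>(
	endpoint: string,
	props: Props,
): Promise<SuccessResponse<Payload> | ErrorResponse> => {
	try {
		const response = await axios.get(`/${endpoint}?${createQueryParams(props)}`);

		return {
			statusCode: 200,
			error: null,
			message: response.data.message,
			payload: response.data,
		};
	} catch (error) {
		return ErrorResponseHandler(error as AxiosError);
	}
};

export default getWithParams;

// e.g. getWithParams<Props, PayloadProps>('countErrors', { start, end });
```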
diff --git a/frontend/src/api/user/getVersion.ts b/frontend/src/api/user/getVersion.ts index a65ede2f0d..0f3e7f8e83 100644 --- a/frontend/src/api/user/getVersion.ts +++ b/frontend/src/api/user/getVersion.ts @@ -1,14 +1,15 @@ import axios from 'api'; import { ErrorResponseHandler } from 'api/ErrorResponseHandler'; import { AxiosError } from 'axios'; +import { getVersion } from 'constants/api'; import { ErrorResponse, SuccessResponse } from 'types/api'; import { PayloadProps } from 'types/api/user/getVersion'; -const getVersion = async (): Promise< +const getVersionApi = async (): Promise< SuccessResponse | ErrorResponse > => { try { - const response = await axios.get(`/version`); + const response = await axios.get(`/${getVersion}`); return { statusCode: 200, @@ -21,4 +22,4 @@ const getVersion = async (): Promise< } }; -export default getVersion; +export default getVersionApi; diff --git a/frontend/src/components/Graph/index.tsx b/frontend/src/components/Graph/index.tsx index 4bb76276c0..3df4de3caa 100644 --- a/frontend/src/components/Graph/index.tsx +++ b/frontend/src/components/Graph/index.tsx @@ -22,6 +22,7 @@ import { Tooltip, } from 'chart.js'; import * as chartjsAdapter from 'chartjs-adapter-date-fns'; +import annotationPlugin from 'chartjs-plugin-annotation'; import React, { useCallback, useEffect, useRef } from 'react'; import { useSelector } from 'react-redux'; import { AppState } from 'store/reducers'; @@ -50,6 +51,7 @@ Chart.register( SubTitle, BarController, BarElement, + annotationPlugin, ); function Graph({ @@ -62,6 +64,7 @@ function Graph({ name, yAxisUnit = 'short', forceReRender, + staticLine, }: GraphProps): JSX.Element { const { isDarkMode } = useSelector((state) => state.app); const chartRef = useRef(null); @@ -99,6 +102,30 @@ function Graph({ intersect: false, }, plugins: { + annotation: staticLine + ? 
{ + annotations: [ + { + type: 'line', + yMin: staticLine.yMin, + yMax: staticLine.yMax, + borderColor: staticLine.borderColor, + borderWidth: staticLine.borderWidth, + label: { + content: staticLine.lineText, + enabled: true, + font: { + size: 10, + }, + borderWidth: 0, + position: 'start', + backgroundColor: 'transparent', + color: staticLine.textColor, + }, + }, + ], + } + : undefined, title: { display: title !== undefined, text: title, @@ -180,6 +207,7 @@ function Graph({ } }, }; + const chartHasData = hasData(data); const chartPlugins = []; @@ -205,6 +233,7 @@ function Graph({ name, yAxisUnit, onClickHandler, + staticLine, ]); useEffect(() => { @@ -229,6 +258,16 @@ interface GraphProps { name: string; yAxisUnit?: string; forceReRender?: boolean | null | number; + staticLine?: StaticLineProps | undefined; +} + +export interface StaticLineProps { + yMin: number | undefined; + yMax: number | undefined; + borderColor: string; + borderWidth: number; + lineText: string; + textColor: string; } export type GraphOnClickHandler = ( @@ -245,5 +284,6 @@ Graph.defaultProps = { onClickHandler: undefined, yAxisUnit: undefined, forceReRender: undefined, + staticLine: undefined, }; export default Graph; diff --git a/frontend/src/constants/api.ts b/frontend/src/constants/api.ts new file mode 100644 index 0000000000..8ebfe3b73c --- /dev/null +++ b/frontend/src/constants/api.ts @@ -0,0 +1,3 @@ +const getVersion = 'version'; + +export { getVersion }; diff --git a/frontend/src/container/AllError/index.tsx b/frontend/src/container/AllError/index.tsx index 51f47c1104..253af7dfe1 100644 --- a/frontend/src/container/AllError/index.tsx +++ b/frontend/src/container/AllError/index.tsx @@ -1,31 +1,85 @@ -import { notification, Table, Tooltip, Typography } from 'antd'; +import { notification, Table, TableProps, Tooltip, Typography } from 'antd'; import { ColumnsType } from 'antd/lib/table'; import getAll from 'api/errors/getAll'; +import getErrorCounts from 'api/errors/getErrorCounts'; import ROUTES from 'constants/routes'; import dayjs from 'dayjs'; -import React, { useEffect } from 'react'; +import createQueryParams from 'lib/createQueryParams'; +import history from 'lib/history'; +import React, { useEffect, useMemo } from 'react'; import { useTranslation } from 'react-i18next'; -import { useQuery } from 'react-query'; +import { useQueries } from 'react-query'; import { useSelector } from 'react-redux'; -import { Link } from 'react-router-dom'; +import { Link, useLocation } from 'react-router-dom'; import { AppState } from 'store/reducers'; -import { Exception } from 'types/api/errors/getAll'; +import { ErrorResponse, SuccessResponse } from 'types/api'; +import { Exception, PayloadProps } from 'types/api/errors/getAll'; import { GlobalReducer } from 'types/reducer/globalTime'; +import { + getDefaultOrder, + getNanoSeconds, + getOffSet, + getOrder, + getOrderParams, + getUpdatePageSize, + urlKey, +} from './utils'; + function AllErrors(): JSX.Element { - const { maxTime, minTime } = useSelector( + const { maxTime, minTime, loading } = useSelector( (state) => state.globalTime, ); + const { search, pathname } = useLocation(); + const params = useMemo(() => new URLSearchParams(search), [search]); const { t } = useTranslation(['common']); - const { isLoading, data } = useQuery(['getAllError', [maxTime, minTime]], { - queryFn: () => - getAll({ - end: maxTime, - start: minTime, - }), - }); + const updatedOrder = getOrder(params.get(urlKey.order)); + const getUpdatedOffset = getOffSet(params.get(urlKey.offset)); + const 
getUpdatedParams = getOrderParams(params.get(urlKey.orderParam)); + const getUpdatedPageSize = getUpdatePageSize(params.get(urlKey.pageSize)); + + const updatedPath = useMemo( + () => + `${pathname}?${createQueryParams({ + order: updatedOrder, + offset: getUpdatedOffset, + orderParam: getUpdatedParams, + pageSize: getUpdatedPageSize, + })}`, + [ + pathname, + updatedOrder, + getUpdatedOffset, + getUpdatedParams, + getUpdatedPageSize, + ], + ); + + const [{ isLoading, data }, errorCountResponse] = useQueries([ + { + queryKey: ['getAllErrors', updatedPath, maxTime, minTime], + queryFn: (): Promise | ErrorResponse> => + getAll({ + end: maxTime, + start: minTime, + order: updatedOrder, + limit: getUpdatedPageSize, + offset: getUpdatedOffset, + orderParam: getUpdatedParams, + }), + enabled: !loading, + }, + { + queryKey: ['getErrorCounts', maxTime, minTime], + queryFn: (): Promise> => + getErrorCounts({ + end: maxTime, + start: minTime, + }), + }, + ]); useEffect(() => { if (data?.error) { @@ -35,11 +89,9 @@ function AllErrors(): JSX.Element { } }, [data?.error, data?.payload, t]); - const getDateValue = (value: string): JSX.Element => { - return ( - {dayjs(value).format('DD/MM/YYYY HH:mm:ss A')} - ); - }; + const getDateValue = (value: string): JSX.Element => ( + {dayjs(value).format('DD/MM/YYYY HH:mm:ss A')} + ); const columns: ColumnsType = [ { @@ -49,14 +101,20 @@ function AllErrors(): JSX.Element { render: (value, record): JSX.Element => ( value}> {value} ), - sorter: (a, b): number => - a.exceptionType.charCodeAt(0) - b.exceptionType.charCodeAt(0), + sorter: true, + defaultSortOrder: getDefaultOrder( + getUpdatedParams, + updatedOrder, + 'exceptionType', + ), }, { title: 'Error Message', @@ -78,39 +136,86 @@ function AllErrors(): JSX.Element { title: 'Count', dataIndex: 'exceptionCount', key: 'exceptionCount', - sorter: (a, b): number => a.exceptionCount - b.exceptionCount, + sorter: true, + defaultSortOrder: getDefaultOrder( + getUpdatedParams, + updatedOrder, + 'exceptionCount', + ), }, { title: 'Last Seen', dataIndex: 'lastSeen', key: 'lastSeen', render: getDateValue, - sorter: (a, b): number => - dayjs(b.lastSeen).isBefore(dayjs(a.lastSeen)) === true ? 1 : 0, + sorter: true, + defaultSortOrder: getDefaultOrder( + getUpdatedParams, + updatedOrder, + 'lastSeen', + ), }, { title: 'First Seen', dataIndex: 'firstSeen', key: 'firstSeen', render: getDateValue, - sorter: (a, b): number => - dayjs(b.firstSeen).isBefore(dayjs(a.firstSeen)) === true ? 1 : 0, + sorter: true, + defaultSortOrder: getDefaultOrder( + getUpdatedParams, + updatedOrder, + 'firstSeen', + ), }, { title: 'Application', dataIndex: 'serviceName', key: 'serviceName', - sorter: (a, b): number => - a.serviceName.charCodeAt(0) - b.serviceName.charCodeAt(0), + sorter: true, + defaultSortOrder: getDefaultOrder( + getUpdatedParams, + updatedOrder, + 'serviceName', + ), }, ]; + const onChangeHandler: TableProps['onChange'] = ( + paginations, + _, + sorter, + ) => { + if (!Array.isArray(sorter)) { + const { pageSize = 0, current = 0 } = paginations; + const { columnKey = '', order } = sorter; + const updatedOrder = order === 'ascend' ? 
'ascending' : 'descending'; + + history.replace( + `${pathname}?${createQueryParams({ + order: updatedOrder, + offset: (current - 1) * pageSize, + orderParam: columnKey, + pageSize, + })}`, + ); + } + }; + return ( ); } diff --git a/frontend/src/container/AllError/utils.test.ts b/frontend/src/container/AllError/utils.test.ts new file mode 100644 index 0000000000..344d318ebf --- /dev/null +++ b/frontend/src/container/AllError/utils.test.ts @@ -0,0 +1,109 @@ +import { Order, OrderBy } from 'types/api/errors/getAll'; + +import { + getDefaultOrder, + getLimit, + getOffSet, + getOrder, + getOrderParams, + getUpdatePageSize, + isOrder, + isOrderParams, +} from './utils'; + +describe('Error utils', () => { + test('Valid OrderBy Params', () => { + expect(isOrderParams('serviceName')).toBe(true); + expect(isOrderParams('exceptionCount')).toBe(true); + expect(isOrderParams('lastSeen')).toBe(true); + expect(isOrderParams('firstSeen')).toBe(true); + expect(isOrderParams('exceptionType')).toBe(true); + }); + + test('Invalid OrderBy Params', () => { + expect(isOrderParams('invalid')).toBe(false); + expect(isOrderParams(null)).toBe(false); + expect(isOrderParams('')).toBe(false); + }); + + test('Valid Order', () => { + expect(isOrder('ascending')).toBe(true); + expect(isOrder('descending')).toBe(true); + }); + + test('Invalid Order', () => { + expect(isOrder('invalid')).toBe(false); + expect(isOrder(null)).toBe(false); + expect(isOrder('')).toBe(false); + }); + + test('Default Order', () => { + const OrderBy: OrderBy[] = [ + 'exceptionCount', + 'exceptionType', + 'firstSeen', + 'lastSeen', + 'serviceName', + ]; + + const order: Order[] = ['ascending', 'descending']; + + const ascOrd = order[0]; + const desOrd = order[1]; + + OrderBy.forEach((order) => { + expect(getDefaultOrder(order, ascOrd, order)).toBe('ascend'); + expect(getDefaultOrder(order, desOrd, order)).toBe('descend'); + }); + }); + + test('Limit', () => { + expect(getLimit(null)).toBe(10); + expect(getLimit('')).toBe(10); + expect(getLimit('0')).toBe(0); + expect(getLimit('1')).toBe(1); + expect(getLimit('10')).toBe(10); + expect(getLimit('11')).toBe(11); + expect(getLimit('100')).toBe(100); + expect(getLimit('101')).toBe(101); + }); + + test('Update Page Size', () => { + expect(getUpdatePageSize(null)).toBe(10); + expect(getUpdatePageSize('')).toBe(10); + expect(getUpdatePageSize('0')).toBe(0); + expect(getUpdatePageSize('1')).toBe(1); + expect(getUpdatePageSize('10')).toBe(10); + expect(getUpdatePageSize('11')).toBe(11); + expect(getUpdatePageSize('100')).toBe(100); + expect(getUpdatePageSize('101')).toBe(101); + }); + + test('Order Params', () => { + expect(getOrderParams(null)).toBe('serviceName'); + expect(getOrderParams('')).toBe('serviceName'); + expect(getOrderParams('serviceName')).toBe('serviceName'); + expect(getOrderParams('exceptionCount')).toBe('exceptionCount'); + expect(getOrderParams('lastSeen')).toBe('lastSeen'); + expect(getOrderParams('firstSeen')).toBe('firstSeen'); + expect(getOrderParams('exceptionType')).toBe('exceptionType'); + }); + + test('OffSet', () => { + expect(getOffSet(null)).toBe(0); + expect(getOffSet('')).toBe(0); + expect(getOffSet('0')).toBe(0); + expect(getOffSet('1')).toBe(1); + expect(getOffSet('10')).toBe(10); + expect(getOffSet('11')).toBe(11); + expect(getOffSet('100')).toBe(100); + expect(getOffSet('101')).toBe(101); + }); + + test('Order', () => { + expect(getOrder(null)).toBe('ascending'); + expect(getOrder('')).toBe('ascending'); + expect(getOrder('ascending')).toBe('ascending'); + 
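The `isOrder`/`isOrderParams` guards exercised by these tests pin URL values to string-literal unions. One way to keep such a guard and its union from drifting apart is to derive both from a single readonly tuple; the names below are illustrative, not part of the PR:

```typescript
// Derive the union and the runtime check from one source of truth.
const ORDER_VALUES = ['ascending', 'descending'] as const;
type OrderValue = typeof ORDER_VALUES[number];

const isOrderValue = (value: string | null): value is OrderValue =>
	ORDER_VALUES.includes(value as OrderValue);

// Mirrors getOrder() in utils.ts: fall back to a default for unexpected URL input.
const toOrderValue = (value: string | null): OrderValue =>
	isOrderValue(value) ? value : 'ascending';
```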
expect(getOrder('descending')).toBe('descending'); + }); +}); diff --git a/frontend/src/container/AllError/utils.ts b/frontend/src/container/AllError/utils.ts new file mode 100644 index 0000000000..239d404b1c --- /dev/null +++ b/frontend/src/container/AllError/utils.ts @@ -0,0 +1,89 @@ +import { SortOrder } from 'antd/lib/table/interface'; +import Timestamp from 'timestamp-nano'; +import { Order, OrderBy } from 'types/api/errors/getAll'; + +export const isOrder = (order: string | null): order is Order => + !!(order === 'ascending' || order === 'descending'); + +export const urlKey = { + order: 'order', + offset: 'offset', + orderParam: 'orderParam', + pageSize: 'pageSize', +}; + +export const isOrderParams = (orderBy: string | null): orderBy is OrderBy => { + return !!( + orderBy === 'serviceName' || + orderBy === 'exceptionCount' || + orderBy === 'lastSeen' || + orderBy === 'firstSeen' || + orderBy === 'exceptionType' + ); +}; + +export const getOrder = (order: string | null): Order => { + if (isOrder(order)) { + return order; + } + return 'ascending'; +}; + +export const getLimit = (limit: string | null): number => { + if (limit) { + return parseInt(limit, 10); + } + return 10; +}; + +export const getOffSet = (offset: string | null): number => { + if (offset && typeof offset === 'string') { + return parseInt(offset, 10); + } + return 0; +}; + +export const getOrderParams = (order: string | null): OrderBy => { + if (isOrderParams(order)) { + return order; + } + return 'serviceName'; +}; + +export const getDefaultOrder = ( + orderBy: OrderBy, + order: Order, + data: OrderBy, + // eslint-disable-next-line sonarjs/cognitive-complexity +): SortOrder | undefined => { + if (orderBy === 'exceptionType' && data === 'exceptionType') { + return order === 'ascending' ? 'ascend' : 'descend'; + } + if (orderBy === 'serviceName' && data === 'serviceName') { + return order === 'ascending' ? 'ascend' : 'descend'; + } + if (orderBy === 'exceptionCount' && data === 'exceptionCount') { + return order === 'ascending' ? 'ascend' : 'descend'; + } + if (orderBy === 'lastSeen' && data === 'lastSeen') { + return order === 'ascending' ? 'ascend' : 'descend'; + } + if (orderBy === 'firstSeen' && data === 'firstSeen') { + return order === 'ascending' ? 
'ascend' : 'descend'; + } + return undefined; +}; + +export const getNanoSeconds = (date: string): string => { + return ( + Math.floor(new Date(date).getTime() / 1e3).toString() + + Timestamp.fromString(date).getNano().toString() + ); +}; + +export const getUpdatePageSize = (pageSize: string | null): number => { + if (pageSize) { + return parseInt(pageSize, 10); + } + return 10; +}; diff --git a/frontend/src/container/CreateAlertRule/index.tsx b/frontend/src/container/CreateAlertRule/index.tsx new file mode 100644 index 0000000000..f527fbbdf1 --- /dev/null +++ b/frontend/src/container/CreateAlertRule/index.tsx @@ -0,0 +1,22 @@ +import { Form } from 'antd'; +import FormAlertRules from 'container/FormAlertRules'; +import React from 'react'; +import { AlertDef } from 'types/api/alerts/def'; + +function CreateRules({ initialValue }: CreateRulesProps): JSX.Element { + const [formInstance] = Form.useForm(); + + return ( + + ); +} + +interface CreateRulesProps { + initialValue: AlertDef; +} + +export default CreateRules; diff --git a/frontend/src/container/EditRules/index.tsx b/frontend/src/container/EditRules/index.tsx index e228af0a10..cf4a02e717 100644 --- a/frontend/src/container/EditRules/index.tsx +++ b/frontend/src/container/EditRules/index.tsx @@ -1,102 +1,23 @@ -import { SaveFilled } from '@ant-design/icons'; -import { Button, notification } from 'antd'; -import put from 'api/alerts/put'; -import Editor from 'components/Editor'; -import ROUTES from 'constants/routes'; -import { State } from 'hooks/useFetch'; -import history from 'lib/history'; -import React, { useCallback, useState } from 'react'; -import { PayloadProps } from 'types/api/alerts/get'; -import { PayloadProps as PutPayloadProps } from 'types/api/alerts/put'; +import { Form } from 'antd'; +import FormAlertRules from 'container/FormAlertRules'; +import React from 'react'; +import { AlertDef } from 'types/api/alerts/def'; -import { ButtonContainer } from './styles'; - -function EditRules({ initialData, ruleId }: EditRulesProps): JSX.Element { - const [value, setEditorValue] = useState(initialData); - const [notifications, Element] = notification.useNotification(); - const [editButtonState, setEditButtonState] = useState>( - { - error: false, - errorMessage: '', - loading: false, - success: false, - payload: undefined, - }, - ); - - const onClickHandler = useCallback(async () => { - try { - setEditButtonState((state) => ({ - ...state, - loading: true, - })); - const response = await put({ - data: value, - id: parseInt(ruleId, 10), - }); - - if (response.statusCode === 200) { - setEditButtonState((state) => ({ - ...state, - loading: false, - payload: response.payload, - })); - - notifications.success({ - message: 'Success', - description: 'Congrats. The alert was Edited correctly.', - }); - - setTimeout(() => { - history.push(ROUTES.LIST_ALL_ALERT); - }, 2000); - } else { - setEditButtonState((state) => ({ - ...state, - loading: false, - errorMessage: response.error || 'Something went wrong', - error: true, - })); - - notifications.error({ - message: 'Error', - description: - response.error || - 'Oops! Some issue occured in editing the alert please try again or contact support@signoz.io', - }); - } - } catch (error) { - notifications.error({ - message: 'Error', - description: - 'Oops! 
Some issue occured in editing the alert please try again or contact support@signoz.io', - }); - } - }, [value, ruleId, notifications]); +function EditRules({ initialValue, ruleId }: EditRulesProps): JSX.Element { + const [formInstance] = Form.useForm(); return ( - <> - {Element} - - setEditorValue(value)} value={value} /> - - - - - + ); } interface EditRulesProps { - initialData: PayloadProps['data']; - ruleId: string; + initialValue: AlertDef; + ruleId: number; } export default EditRules; diff --git a/frontend/src/container/ErrorDetails/index.tsx b/frontend/src/container/ErrorDetails/index.tsx index a5f8efe756..d42d2e4a3e 100644 --- a/frontend/src/container/ErrorDetails/index.tsx +++ b/frontend/src/container/ErrorDetails/index.tsx @@ -1,25 +1,49 @@ import { Button, Divider, notification, Space, Table, Typography } from 'antd'; +import getNextPrevId from 'api/errors/getNextPrevId'; import Editor from 'components/Editor'; +import { getNanoSeconds } from 'container/AllError/utils'; import dayjs from 'dayjs'; import history from 'lib/history'; +import { urlKey } from 'pages/ErrorDetails/utils'; import React, { useMemo, useState } from 'react'; import { useTranslation } from 'react-i18next'; +import { useQuery } from 'react-query'; import { useLocation } from 'react-router-dom'; import { PayloadProps as GetByErrorTypeAndServicePayload } from 'types/api/errors/getByErrorTypeAndService'; -import { PayloadProps } from 'types/api/errors/getById'; import { DashedContainer, EditorContainer, EventContainer } from './styles'; function ErrorDetails(props: ErrorDetailsProps): JSX.Element { const { idPayload } = props; - const [isLoading, setLoading] = useState(false); const { t } = useTranslation(['errorDetails', 'common']); - const { search } = useLocation(); - const params = new URLSearchParams(search); - const queryErrorId = params.get('errorId'); - const serviceName = params.get('serviceName'); - const errorType = params.get('errorType'); + + const params = useMemo(() => new URLSearchParams(search), [search]); + + const errorId = params.get(urlKey.errorId); + const serviceName = params.get(urlKey.serviceName); + const errorType = params.get(urlKey.exceptionType); + const timestamp = params.get(urlKey.timestamp); + + const { data: nextPrevData, status: nextPrevStatus } = useQuery( + [ + idPayload.errorId, + idPayload.groupID, + idPayload.timestamp, + errorId, + serviceName, + errorType, + timestamp, + ], + { + queryFn: () => + getNextPrevId({ + errorID: errorId || idPayload.errorId, + groupID: idPayload.groupID, + timestamp: timestamp || getNanoSeconds(idPayload.timestamp), + }), + }, + ); const errorDetail = idPayload; @@ -48,34 +72,32 @@ function ErrorDetails(props: ErrorDetailsProps): JSX.Element { 'errorId', 'timestamp', 'exceptionMessage', - 'newerErrorId', - 'olderErrorId', + 'exceptionEscaped', ], [], ); - const onClickErrorIdHandler = async (id: string): Promise => { + const onClickErrorIdHandler = async ( + id: string, + timestamp: string, + ): Promise => { try { - setLoading(true); - if (id.length === 0) { notification.error({ message: 'Error Id cannot be empty', }); - setLoading(false); return; } - setLoading(false); - - history.push( - `${history.location.pathname}?errorId=${id}&serviceName=${serviceName}&errorType=${errorType}`, + history.replace( + `${history.location.pathname}?&groupId=${ + idPayload.groupID + }×tamp=${getNanoSeconds(timestamp)}&errorId=${id}`, ); } catch (error) { notification.error({ message: t('something_went_wrong'), }); - setLoading(false); } }; @@ -106,25 +128,25 
@@ function ErrorDetails(props: ErrorDetailsProps): JSX.Element {
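For orientation, the error navigation wired up above boils down to a single `react-query` lookup keyed on the currently viewed error; a condensed sketch (the hook name is illustrative, and the payload shape lives in `types/api/errors/getNextPrevId`):

```typescript
import getNextPrevId from 'api/errors/getNextPrevId';
import { useQuery } from 'react-query';

// Fetches the neighbouring (next/previous) error IDs for the current error.
function useNextPrevErrorIds(
	errorID: string,
	groupID: string,
	timestamp: string,
) {
	return useQuery(['nextPrevErrorIDs', errorID, groupID, timestamp], () =>
		getNextPrevId({ errorID, groupID, timestamp }),
	);
}
```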
+ {t('user_guide_headline')} + + + + + + {queryType === EQueryType.QUERY_BUILDER && renderGuideForQB()} + {queryType === EQueryType.PROM && renderGuideForPQL()} + + ); +} + +interface UserGuideProps { + queryType: EQueryType; +} + +export default UserGuide; diff --git a/frontend/src/container/FormAlertRules/UserGuide/styles.ts b/frontend/src/container/FormAlertRules/UserGuide/styles.ts new file mode 100644 index 0000000000..d4292f32d8 --- /dev/null +++ b/frontend/src/container/FormAlertRules/UserGuide/styles.ts @@ -0,0 +1,17 @@ +import { Card, Typography } from 'antd'; +import styled from 'styled-components'; + +export const StyledMainContainer = styled(Card)``; + +export const StyledTopic = styled(Typography.Paragraph)` + font-weight: 600; +`; + +export const StyledList = styled.ul` + padding-left: 18px; +`; + +export const StyledListItem = styled.li` + font-style: italic; + padding-bottom: 0.5rem; +`; diff --git a/frontend/src/container/FormAlertRules/index.tsx b/frontend/src/container/FormAlertRules/index.tsx new file mode 100644 index 0000000000..38fcaad04d --- /dev/null +++ b/frontend/src/container/FormAlertRules/index.tsx @@ -0,0 +1,381 @@ +import { ExclamationCircleOutlined, SaveOutlined } from '@ant-design/icons'; +import { FormInstance, Modal, notification, Typography } from 'antd'; +import saveAlertApi from 'api/alerts/save'; +import ROUTES from 'constants/routes'; +import QueryTypeTag from 'container/NewWidget/LeftContainer/QueryTypeTag'; +import PlotTag from 'container/NewWidget/LeftContainer/WidgetGraph/PlotTag'; +import history from 'lib/history'; +import React, { useCallback, useEffect, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useQueryClient } from 'react-query'; +import { + IFormulaQueries, + IMetricQueries, + IPromQueries, +} from 'types/api/alerts/compositeQuery'; +import { + AlertDef, + defaultEvalWindow, + defaultMatchType, +} from 'types/api/alerts/def'; +import { Query as StagedQuery } from 'types/api/dashboard/getAll'; +import { EQueryType } from 'types/common/dashboard'; + +import BasicInfo from './BasicInfo'; +import ChartPreview from './ChartPreview'; +import QuerySection from './QuerySection'; +import RuleOptions from './RuleOptions'; +import { + ActionButton, + ButtonContainer, + MainFormContainer, + PanelContainer, + StyledLeftContainer, + StyledRightContainer, +} from './styles'; +import useDebounce from './useDebounce'; +import UserGuide from './UserGuide'; +import { + prepareBuilderQueries, + prepareStagedQuery, + toChartInterval, + toFormulaQueries, + toMetricQueries, +} from './utils'; + +function FormAlertRules({ + formInstance, + initialValue, + ruleId, +}: FormAlertRuleProps): JSX.Element { + // init namespace for translations + const { t } = useTranslation('alerts'); + + // use query client + const ruleCache = useQueryClient(); + + const [loading, setLoading] = useState(false); + + // alertDef holds the form values to be posted + const [alertDef, setAlertDef] = useState(initialValue); + + // initQuery contains initial query when component was mounted + const initQuery = initialValue?.condition?.compositeMetricQuery; + + const [queryCategory, setQueryCategory] = useState( + initQuery?.queryType, + ); + + // local state to handle metric queries + const [metricQueries, setMetricQueries] = useState( + toMetricQueries(initQuery?.builderQueries), + ); + + // local state to handle formula queries + const [formulaQueries, setFormulaQueries] = useState( + toFormulaQueries(initQuery?.builderQueries), + ); + + // local 
state to handle promql queries + const [promQueries, setPromQueries] = useState({ + ...initQuery?.promQueries, + }); + + // staged query is used to display chart preview + const [stagedQuery, setStagedQuery] = useState(); + const debouncedStagedQuery = useDebounce(stagedQuery, 500); + + // this useEffect initiates staged query and + // other queries based on server data. + // useful when fetching of initial values (from api) + // is delayed + useEffect(() => { + const initQuery = initialValue?.condition?.compositeMetricQuery; + const typ = initQuery?.queryType; + + // extract metric query from builderQueries + const mq = toMetricQueries(initQuery?.builderQueries); + + // extract formula query from builderQueries + const fq = toFormulaQueries(initQuery?.builderQueries); + + // prepare staged query + const sq = prepareStagedQuery(typ, mq, fq, initQuery?.promQueries); + const pq = initQuery?.promQueries; + + setQueryCategory(typ); + setMetricQueries(mq); + setFormulaQueries(fq); + setPromQueries(pq); + setStagedQuery(sq); + setAlertDef(initialValue); + }, [initialValue]); + + // this useEffect updates staging query when + // any of its sub-parameters changes + useEffect(() => { + // prepare staged query + const sq: StagedQuery = prepareStagedQuery( + queryCategory, + metricQueries, + formulaQueries, + promQueries, + ); + setStagedQuery(sq); + }, [queryCategory, metricQueries, formulaQueries, promQueries]); + + const onCancelHandler = useCallback(() => { + history.replace(ROUTES.LIST_ALL_ALERT); + }, []); + + // onQueryCategoryChange handles changes to query category + // in state as well as sets additional defaults + const onQueryCategoryChange = (val: EQueryType): void => { + setQueryCategory(val); + if (val === EQueryType.PROM) { + setAlertDef({ + ...alertDef, + condition: { + ...alertDef.condition, + matchType: defaultMatchType, + }, + evalWindow: defaultEvalWindow, + }); + } + }; + + const isFormValid = useCallback((): boolean => { + let retval = true; + + if (!alertDef.alert || alertDef.alert === '') { + notification.error({ + message: 'Error', + description: t('alertname_required'), + }); + return false; + } + + if ( + queryCategory === EQueryType.PROM && + (!promQueries || Object.keys(promQueries).length === 0) + ) { + notification.error({ + message: 'Error', + description: t('promql_required'), + }); + return false; + } + + if ( + queryCategory === EQueryType.QUERY_BUILDER && + (!metricQueries || Object.keys(metricQueries).length === 0) + ) { + notification.error({ + message: 'Error', + description: t('condition_required'), + }); + return false; + } + + Object.keys(metricQueries).forEach((key) => { + if (metricQueries[key].metricName === '') { + retval = false; + notification.error({ + message: 'Error', + description: t('metricname_missing', { where: metricQueries[key].name }), + }); + } + }); + + Object.keys(formulaQueries).forEach((key) => { + if (formulaQueries[key].expression === '') { + retval = false; + notification.error({ + message: 'Error', + description: t('expression_missing', { where: formulaQueries[key].name }), + }); + } + }); + + return retval; + }, [t, alertDef, queryCategory, metricQueries, formulaQueries, promQueries]);
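The chart preview rendered a little further down draws the alert threshold with `chartjs-plugin-annotation`, which this PR registers in `components/Graph`. A standalone sketch of the `StaticLineProps` it consumes (values are illustrative; presumably ChartPreview derives them from `alertDef.condition.target`):

```typescript
import { StaticLineProps } from 'components/Graph';

const threshold = 450;

// Equal yMin/yMax produce a horizontal line at the threshold value.
const staticLine: StaticLineProps = {
	yMin: threshold,
	yMax: threshold,
	borderColor: '#f14',
	borderWidth: 1,
	lineText: `Threshold: ${threshold}`,
	textColor: '#f14',
};
```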
+ + const saveRule = useCallback(async () => { + if (!isFormValid()) { + return; + } + + const postableAlert: AlertDef = { + ...alertDef, + source: window?.location.toString(), + ruleType: + queryCategory === EQueryType.PROM ? 'promql_rule' : 'threshold_rule', + condition: { + ...alertDef.condition, + compositeMetricQuery: { + builderQueries: prepareBuilderQueries(metricQueries, formulaQueries), + promQueries, + queryType: queryCategory, + }, + }, + }; + + setLoading(true); + try { + const apiReq = + ruleId && ruleId > 0 + ? { data: postableAlert, id: ruleId } + : { data: postableAlert }; + + const response = await saveAlertApi(apiReq); + + if (response.statusCode === 200) { + notification.success({ + message: 'Success', + description: + !ruleId || ruleId === 0 ? t('rule_created') : t('rule_edited'), + }); + + // invalidate rule in cache + ruleCache.invalidateQueries(['ruleId', ruleId]); + + setTimeout(() => { + history.replace(ROUTES.LIST_ALL_ALERT); + }, 2000); + } else { + notification.error({ + message: 'Error', + description: response.error || t('unexpected_error'), + }); + } + } catch (e) { + console.error('save alert api failed:', e); + notification.error({ + message: 'Error', + description: t('unexpected_error'), + }); + } + setLoading(false); + }, [ + t, + isFormValid, + queryCategory, + ruleId, + alertDef, + metricQueries, + formulaQueries, + promQueries, + ruleCache, + ]); + + const onSaveHandler = useCallback(async () => { + const content = ( + + {' '} + {t('confirm_save_content_part1')} {' '} + {t('confirm_save_content_part2')} + + ); + Modal.confirm({ + icon: , + title: t('confirm_save_title'), + centered: true, + content, + onOk() { + saveRule(); + }, + }); + }, [t, saveRule, queryCategory]); + + const renderBasicInfo = (): JSX.Element => ( + + ); + + const renderQBChartPreview = (): JSX.Element => { + return ( + } + name="" + threshold={alertDef.condition?.target} + query={debouncedStagedQuery} + selectedInterval={toChartInterval(alertDef.evalWindow)} + /> + ); + }; + + const renderPromChartPreview = (): JSX.Element => { + return ( + } + name="Chart Preview" + threshold={alertDef.condition?.target} + query={debouncedStagedQuery} + /> + ); + }; + + return ( + <> + {Element} + + + + {queryCategory === EQueryType.QUERY_BUILDER && renderQBChartPreview()} + {queryCategory === EQueryType.PROM && renderPromChartPreview()} + + + + + {renderBasicInfo()} + + } + > + {ruleId > 0 ? 
t('button_savechanges') : t('button_createrule')} + + + {ruleId === 0 && t('button_cancelchanges')} + {ruleId > 0 && t('button_discard')} + + + + + + + + + + ); +} + +interface FormAlertRuleProps { + formInstance: FormInstance; + initialValue: AlertDef; + ruleId: number; +} + +export default FormAlertRules; diff --git a/frontend/src/container/FormAlertRules/labels/Labels.machine.ts b/frontend/src/container/FormAlertRules/labels/Labels.machine.ts new file mode 100644 index 0000000000..812a498c65 --- /dev/null +++ b/frontend/src/container/FormAlertRules/labels/Labels.machine.ts @@ -0,0 +1,49 @@ +import { createMachine } from 'xstate'; + +export const ResourceAttributesFilterMachine = + /** @xstate-layout N4IgpgJg5mDOIC5QBECGsAWAjA9qgThAAQDKYBAxhkQIIB2xAYgJYA2ALmPgHQAqqUANJgAngGIAcgFEAGr0SgADjljN2zHHQUgAHogAcAFgAM3AOz6ATAEYAzJdsA2Y4cOWAnABoQIxAFpDR2tuQ319AFYTcKdbFycAX3jvNExcAmIySmp6JjZOHn4hUTFNACFWAFd8bWVVdU1tPQQzY1MXY2tDdzNHM3dHd0NvXwR7biMTa313S0i+63DE5PRsPEJScnwqWgYiFg4uPgFhcQAlKRIpeSQQWrUNLRumx3Czbg8TR0sbS31jfUcw38fW47gBHmm4XCVms3SWIBSq3SGyyO1yBx4AHlFFxUOwcPhJLJrkoVPcGk9ENYFuF3i5YR0wtEHECEAEgiEmV8zH1DLYzHZ4Yi0utMltsrt9vluNjcfjCWVKtUbnd6o9QE1rMYBtxbGFvsZ3NrZj1WdYOfotUZLX0XEFHEKViKMpttjk9nlDrL8HiCWJzpcSbcyWrGoh3NCQj0zK53P1ph1WeFLLqnJZ2s5vmZLA6kginWsXaj3VLDoUAGqoSpgEp0cpVGohh5hhDWDy0sz8zruakzamWVm-Qyg362V5-AZOayO1KFlHitEejFHKCV6v+i5XRt1ZuU1s52zjNOOaZfdOWIY+RDZ0Hc6ZmKEXqyLPPCudit2Sz08ACSEFYNbSHI27kuquiIOEjiONwjJgrM3RWJYZisgEIJgnYPTmuEdi2OaiR5nQOAQHA2hvsiH4Sui0qFCcIGhnuLSmP0YJuJ2xjJsmKELG8XZTK0tjdHG06vgW5GupRS7St6vrKqSO4UhqVL8TBWp8o4eqdl0A5Xmy3G6gK56-B4uERDOSKiuJi6lgUAhrhUYB0buimtrEKZBDYrxaS0OZca8+ltheybOI4hivGZzrzp+VGHH+AGOQp4EIHy+ghNYnawtG4TsbYvk8QKfHGAJfQ9uF76WSW37xWBTSGJ0qXpd0vRZdEKGPqC2YeO2-zfO4+HxEAA */ + createMachine({ + tsTypes: {} as import('./Labels.machine.typegen').Typegen0, + initial: 'Idle', + states: { + LabelKey: { + on: { + NEXT: { + actions: 'onSelectLabelValue', + target: 'LabelValue', + }, + onBlur: { + actions: 'onSelectLabelValue', + target: 'LabelValue', + }, + RESET: { + target: 'Idle', + }, + }, + }, + LabelValue: { + on: { + NEXT: { + actions: ['onValidateQuery'], + }, + onBlur: { + actions: ['onValidateQuery'], + // target: 'Idle', + }, + RESET: { + target: 'Idle', + }, + }, + }, + Idle: { + on: { + NEXT: { + actions: 'onSelectLabelKey', + description: 'Enter a label key', + target: 'LabelKey', + }, + }, + }, + }, + id: 'Label Key Values', + }); diff --git a/frontend/src/container/FormAlertRules/labels/Labels.machine.typegen.ts b/frontend/src/container/FormAlertRules/labels/Labels.machine.typegen.ts new file mode 100644 index 0000000000..f31469f659 --- /dev/null +++ b/frontend/src/container/FormAlertRules/labels/Labels.machine.typegen.ts @@ -0,0 +1,25 @@ +// This file was automatically generated. 
Edits will be overwritten + +export interface Typegen0 { + '@@xstate/typegen': true; + eventsCausingActions: { + onSelectLabelValue: 'NEXT' | 'onBlur'; + onValidateQuery: 'NEXT' | 'onBlur'; + onSelectLabelKey: 'NEXT'; + }; + internalEvents: { + 'xstate.init': { type: 'xstate.init' }; + }; + invokeSrcNameMap: {}; + missingImplementations: { + actions: 'onSelectLabelValue' | 'onValidateQuery' | 'onSelectLabelKey'; + services: never; + guards: never; + delays: never; + }; + eventsCausingServices: {}; + eventsCausingGuards: {}; + eventsCausingDelays: {}; + matchesStates: 'LabelKey' | 'LabelValue' | 'Idle'; + tags: never; +} diff --git a/frontend/src/container/FormAlertRules/labels/QueryChip.tsx b/frontend/src/container/FormAlertRules/labels/QueryChip.tsx new file mode 100644 index 0000000000..47e4c956ff --- /dev/null +++ b/frontend/src/container/FormAlertRules/labels/QueryChip.tsx @@ -0,0 +1,26 @@ +import React from 'react'; + +import { QueryChipContainer, QueryChipItem } from './styles'; +import { ILabelRecord } from './types'; + +interface QueryChipProps { + queryData: ILabelRecord; + onRemove: (id: string) => void; +} + +export default function QueryChip({ + queryData, + onRemove, +}: QueryChipProps): JSX.Element { + const { key, value } = queryData; + return ( + + onRemove(key)} + > + {key}: {value} + + + ); +} diff --git a/frontend/src/container/FormAlertRules/labels/index.tsx b/frontend/src/container/FormAlertRules/labels/index.tsx new file mode 100644 index 0000000000..98fd564cbd --- /dev/null +++ b/frontend/src/container/FormAlertRules/labels/index.tsx @@ -0,0 +1,164 @@ +import { + CloseCircleFilled, + ExclamationCircleOutlined, +} from '@ant-design/icons'; +import { useMachine } from '@xstate/react'; +import { Button, Input, message, Modal } from 'antd'; +import { map } from 'lodash-es'; +import React, { useCallback, useEffect, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useSelector } from 'react-redux'; +import { AppState } from 'store/reducers'; +import { Labels } from 'types/api/alerts/def'; +import AppReducer from 'types/reducer/app'; +import { v4 as uuid } from 'uuid'; + +import { ResourceAttributesFilterMachine } from './Labels.machine'; +import QueryChip from './QueryChip'; +import { QueryChipItem, SearchContainer } from './styles'; +import { ILabelRecord } from './types'; +import { createQuery, flattenLabels, prepareLabels } from './utils'; + +interface LabelSelectProps { + onSetLabels: (q: Labels) => void; + initialValues: Labels | undefined; +} + +function LabelSelect({ + onSetLabels, + initialValues, +}: LabelSelectProps): JSX.Element | null { + const { t } = useTranslation('alerts'); + const { isDarkMode } = useSelector((state) => state.app); + const [currentVal, setCurrentVal] = useState(''); + const [staging, setStaging] = useState([]); + const [queries, setQueries] = useState( + initialValues ? 
flattenLabels(initialValues) : [], + ); + + const dispatchChanges = (updatedRecs: ILabelRecord[]): void => { + onSetLabels(prepareLabels(updatedRecs, initialValues)); + setQueries(updatedRecs); + }; + + const [state, send] = useMachine(ResourceAttributesFilterMachine, { + actions: { + onSelectLabelKey: () => {}, + onSelectLabelValue: () => { + if (currentVal !== '') { + setStaging((prevState) => [...prevState, currentVal]); + } else { + return; + } + setCurrentVal(''); + }, + onValidateQuery: (): void => { + if (currentVal === '') { + return; + } + + const generatedQuery = createQuery([...staging, currentVal]); + + if (generatedQuery) { + dispatchChanges([...queries, generatedQuery]); + setStaging([]); + setCurrentVal(''); + send('RESET'); + } + }, + }, + }); + + const handleFocus = (): void => { + if (state.value === 'Idle') { + send('NEXT'); + } + }; + + const handleBlur = useCallback((): void => { + if (staging.length === 1 && staging[0] !== undefined) { + send('onBlur'); + } + }, [send, staging]); + + useEffect(() => { + handleBlur(); + }, [handleBlur]); + + const handleChange = (e: React.ChangeEvent): void => { + setCurrentVal(e.target?.value); + }; + + const handleClose = (key: string): void => { + dispatchChanges(queries.filter((queryData) => queryData.key !== key)); + }; + + const handleClearAll = (): void => { + Modal.confirm({ + title: 'Confirm', + icon: , + content: t('remove_label_confirm'), + onOk() { + send('RESET'); + dispatchChanges([]); + setStaging([]); + message.success(t('remove_label_success')); + }, + okText: t('button_yes'), + cancelText: t('button_no'), + }); + }; + const renderPlaceholder = useCallback((): string => { + if (state.value === 'LabelKey') return 'Enter a label key then press ENTER.'; + if (state.value === 'LabelValue') + return `Enter a value for label key(${staging[0]}) then press ENTER.`; + return t('placeholder_label_key_pair'); + }, [t, state, staging]); + return ( + +
+ {queries.length > 0 && + map( + queries, + (query): JSX.Element => { + return ( + + ); + }, + )} +
+
+ {map(staging, (item) => { + return {item}; + })} +
+ +
+ { + if (e.key === 'Enter' || e.code === 'Enter') { + send('NEXT'); + } + }} + bordered={false} + value={currentVal as never} + style={{ flex: 1 }} + onFocus={handleFocus} + onBlur={handleBlur} + /> + + {queries.length || staging.length || currentVal ? ( +
+
+ ); +} + +export default LabelSelect; diff --git a/frontend/src/container/FormAlertRules/labels/styles.ts b/frontend/src/container/FormAlertRules/labels/styles.ts new file mode 100644 index 0000000000..04d6871315 --- /dev/null +++ b/frontend/src/container/FormAlertRules/labels/styles.ts @@ -0,0 +1,35 @@ +import { grey } from '@ant-design/colors'; +import { Tag } from 'antd'; +import styled from 'styled-components'; + +interface SearchContainerProps { + isDarkMode: boolean; + disabled: boolean; +} + +export const SearchContainer = styled.div` + width: 70%; + border-radius: 4px; + background: ${({ isDarkMode }): string => (isDarkMode ? '#000' : '#fff')}; + flex: 1; + display: flex; + flex-direction: column; + padding: 0.2rem; + border: 1px solid #ccc5; + ${({ disabled }): string => (disabled ? `cursor: not-allowed;` : '')} +`; + +export const QueryChipContainer = styled.span` + display: flex; + align-items: center; + margin-right: 0.5rem; + &:hover { + & > * { + background: ${grey.primary}44; + } + } +`; + +export const QueryChipItem = styled(Tag)` + margin-right: 0.1rem; +`; diff --git a/frontend/src/container/FormAlertRules/labels/types.ts b/frontend/src/container/FormAlertRules/labels/types.ts new file mode 100644 index 0000000000..b10fc3fded --- /dev/null +++ b/frontend/src/container/FormAlertRules/labels/types.ts @@ -0,0 +1,9 @@ +export interface ILabelRecord { + key: string; + value: string; +} + +export interface IOption { + label: string; + value: string; +} diff --git a/frontend/src/container/FormAlertRules/labels/utils.ts b/frontend/src/container/FormAlertRules/labels/utils.ts new file mode 100644 index 0000000000..1a2943f3ee --- /dev/null +++ b/frontend/src/container/FormAlertRules/labels/utils.ts @@ -0,0 +1,54 @@ +import { Labels } from 'types/api/alerts/def'; + +import { ILabelRecord } from './types'; + +const hiddenLabels = ['severity', 'description']; + +export const createQuery = ( + selectedItems: Array = [], +): ILabelRecord | null => { + if (selectedItems.length === 2) { + return { + key: selectedItems[0] as string, + value: selectedItems[1] as string, + }; + } + return null; +}; + +export const flattenLabels = (labels: Labels): ILabelRecord[] => { + const recs: ILabelRecord[] = []; + + Object.keys(labels).forEach((key) => { + if (!hiddenLabels.includes(key)) { + recs.push({ + key, + value: labels[key], + }); + } + }); + + return recs; +}; + +export const prepareLabels = ( + recs: ILabelRecord[], + alertLabels: Labels | undefined, +): Labels => { + const labels: Labels = {}; + + recs.forEach((rec) => { + if (!hiddenLabels.includes(rec.key)) { + labels[rec.key] = rec.value; + } + }); + if (alertLabels) { + Object.keys(alertLabels).forEach((key) => { + if (hiddenLabels.includes(key)) { + labels[key] = alertLabels[key]; + } + }); + } + + return labels; +}; diff --git a/frontend/src/container/FormAlertRules/styles.ts b/frontend/src/container/FormAlertRules/styles.ts new file mode 100644 index 0000000000..4ec8dcafbd --- /dev/null +++ b/frontend/src/container/FormAlertRules/styles.ts @@ -0,0 +1,101 @@ +import { Button, Card, Col, Form, Input, InputNumber, Row, Select } from 'antd'; +import TextArea from 'antd/lib/input/TextArea'; +import styled from 'styled-components'; + +export const PanelContainer = styled(Row)` + flex-wrap: nowrap; +`; + +export const StyledRightContainer = styled(Col)` + &&& { + } +`; + +export const StyledLeftContainer = styled(Col)` + &&& { + margin-right: 1rem; + } +`; + +export const MainFormContainer = styled(Form)``; + +export const ButtonContainer = 
styled.div` + &&& { + display: flex; + justify-content: flex-start; + align-items: center; + margin-top: 1rem; + margin-bottom: 3rem; + } +`; + +export const ActionButton = styled(Button)` + margin-right: 1rem; +`; + +export const QueryButton = styled(Button)` + &&& { + display: flex; + align-items: center; + margin-right: 1rem; + } +`; + +export const QueryContainer = styled(Card)` + &&& { + margin-top: 1rem; + min-height: 23.5%; + } +`; + +export const Container = styled.div` + margin-top: 1rem; + display: flex; + flex-direction: column; +`; + +export const StepHeading = styled.p` + margin-top: 1rem; + font-weight: bold; +`; + +export const InlineSelect = styled(Select)` + display: inline-block; + width: 10% !important; + margin-left: 0.2em; + margin-right: 0.2em; +`; + +export const SeveritySelect = styled(Select)` + width: 15% !important; +`; + +export const InputSmall = styled(Input)` + width: 40% !important; +`; + +export const FormContainer = styled(Card)` + padding: 2em; + margin-top: 1rem; + display: flex; + flex-direction: column; + border-radius: 4px; +`; + +export const ThresholdInput = styled(InputNumber)` + & > div { + display: flex; + align-items: center; + & > .ant-input-number-group-addon { + width: 130px; + } + & > .ant-input-number { + width: 50%; + margin-left: 1em; + } + } +`; + +export const TextareaMedium = styled(TextArea)` + width: 70%; +`; diff --git a/frontend/src/container/FormAlertRules/useDebounce.js b/frontend/src/container/FormAlertRules/useDebounce.js new file mode 100644 index 0000000000..e430f55d63 --- /dev/null +++ b/frontend/src/container/FormAlertRules/useDebounce.js @@ -0,0 +1,31 @@ +/* eslint-disable */ +// @ts-ignore +// @ts-nocheck + +import { useEffect, useState } from 'react'; + +// see https://github.com/tannerlinsley/react-query/issues/293 +// see https://usehooks.com/useDebounce/ +export default function useDebounce(value, delay) { + // State and setters for debounced value + const [debouncedValue, setDebouncedValue] = useState(value); + + useEffect( + () => { + // Update debounced value after delay + const handler = setTimeout(() => { + setDebouncedValue(value); + }, delay); + + // Cancel the timeout if value changes (also on delay change or unmount) + // This is how we prevent debounced value from updating if value is changed ... + // .. within the delay period. Timeout gets cleared and restarted. 
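`useDebounce` ships as plain JavaScript behind `@ts-nocheck`; a typed equivalent is straightforward (an editorial sketch, not part of this PR):

```typescript
import { useEffect, useState } from 'react';

// Returns `value`, but only after it has been stable for `delay` ms.
export default function useDebounce<T>(value: T, delay: number): T {
	const [debouncedValue, setDebouncedValue] = useState<T>(value);

	useEffect(() => {
		// Restart the timer whenever value or delay changes.
		const handler = setTimeout(() => setDebouncedValue(value), delay);

		return (): void => clearTimeout(handler);
	}, [value, delay]);

	return debouncedValue;
}
```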
+ return () => { + clearTimeout(handler); + }; + }, + [value, delay] // Only re-call effect if value or delay changes + ); + + return debouncedValue; +} diff --git a/frontend/src/container/FormAlertRules/utils.ts b/frontend/src/container/FormAlertRules/utils.ts new file mode 100644 index 0000000000..ea40ce692b --- /dev/null +++ b/frontend/src/container/FormAlertRules/utils.ts @@ -0,0 +1,136 @@ +import { Time } from 'container/TopNav/DateTimeSelection/config'; +import { + IBuilderQueries, + IFormulaQueries, + IFormulaQuery, + IMetricQueries, + IMetricQuery, + IPromQueries, + IPromQuery, +} from 'types/api/alerts/compositeQuery'; +import { + IMetricsBuilderQuery, + Query as IStagedQuery, +} from 'types/api/dashboard/getAll'; +import { EQueryType } from 'types/common/dashboard'; + +export const toFormulaQueries = (b: IBuilderQueries): IFormulaQueries => { + const f: IFormulaQueries = {}; + if (!b) return f; + Object.keys(b).forEach((key) => { + if (key === 'F1') { + f[key] = b[key] as IFormulaQuery; + } + }); + + return f; +}; + +export const toMetricQueries = (b: IBuilderQueries): IMetricQueries => { + const m: IMetricQueries = {}; + if (!b) return m; + Object.keys(b).forEach((key) => { + if (key !== 'F1') { + m[key] = b[key] as IMetricQuery; + } + }); + + return m; +}; + +export const toIMetricsBuilderQuery = ( + q: IMetricQuery, +): IMetricsBuilderQuery => { + return { + name: q.name, + metricName: q.metricName, + tagFilters: q.tagFilters, + groupBy: q.groupBy, + aggregateOperator: q.aggregateOperator, + disabled: q.disabled, + legend: q.legend, + }; +}; + +export const prepareBuilderQueries = ( + m: IMetricQueries, + f: IFormulaQueries, +): IBuilderQueries => { + if (!m) return {}; + const b: IBuilderQueries = { + ...m, + }; + + Object.keys(f).forEach((key) => { + b[key] = { + ...f[key], + aggregateOperator: undefined, + metricName: '', + }; + }); + return b; +}; + +export const prepareStagedQuery = ( + t: EQueryType, + m: IMetricQueries, + f: IFormulaQueries, + p: IPromQueries, +): IStagedQuery => { + const qbList: IMetricQuery[] = []; + const formulaList: IFormulaQuery[] = []; + const promList: IPromQuery[] = []; + + // convert map[string]IMetricQuery to IMetricQuery[] + if (m) { + Object.keys(m).forEach((key) => { + qbList.push(m[key]); + }); + } + + // convert map[string]IFormulaQuery to IFormulaQuery[] + if (f) { + Object.keys(f).forEach((key) => { + formulaList.push(f[key]); + }); + } + + // convert map[string]IPromQuery to IPromQuery[] + if (p) { + Object.keys(p).forEach((key) => { + promList.push({ ...p[key], name: key }); + }); + } + + return { + queryType: t, + promQL: promList, + metricsBuilder: { + formulas: formulaList, + queryBuilder: qbList, + }, + clickHouse: [], + }; +}; + +// toChartInterval converts eval window to chart selection time interval +export const toChartInterval = (evalWindow: string | undefined): Time => { + switch (evalWindow) { + case '5m0s': + return '5min'; + case '10m0s': + return '10min'; + case '15m0s': + return '15min'; + case '30m0s': + return '30min'; + case '60m0s': + return '30min'; + case '4h0m0s': + return '4hr'; + case '24h0m0s': + return '1day'; + default: + return '5min'; + } +}; diff --git a/frontend/src/container/GantChart/SpanName/index.tsx b/frontend/src/container/GantChart/SpanName/index.tsx index 47d58c3e5c..7f536624b9 100644 --- a/frontend/src/container/GantChart/SpanName/index.tsx +++ b/frontend/src/container/GantChart/SpanName/index.tsx @@ -10,7 +10,7 @@ function SpanNameComponent({ {name} - {serviceName} + {serviceName} ); diff 
--git a/frontend/src/container/GantChart/SpanName/styles.ts b/frontend/src/container/GantChart/SpanName/styles.ts index 642e28f639..abd41dc54e 100644 --- a/frontend/src/container/GantChart/SpanName/styles.ts +++ b/frontend/src/container/GantChart/SpanName/styles.ts @@ -9,7 +9,7 @@ export const Span = styled(Typography.Paragraph)` } `; -export const Service = styled(Typography)` +export const Service = styled(Typography.Paragraph)` &&& { color: #acacac; font-size: 0.75rem; diff --git a/frontend/src/container/GantChart/Trace/styles.ts b/frontend/src/container/GantChart/Trace/styles.ts index 7710e77b5b..a85eec454c 100644 --- a/frontend/src/container/GantChart/Trace/styles.ts +++ b/frontend/src/container/GantChart/Trace/styles.ts @@ -41,8 +41,9 @@ export const CardContainer = styled.li<{ isMissing?: boolean }>` width: 100%; cursor: pointer; border-radius: 0.25rem; + z-index: 2; ${({ isMissing }): string => - isMissing ? `border: 1px dashed ${volcano[6]};` : ''} + isMissing ? `border: 1px dashed ${volcano[6]} !important;` : ''} `; interface Props { diff --git a/frontend/src/container/GridGraphComponent/index.tsx b/frontend/src/container/GridGraphComponent/index.tsx index d2139b1a08..3a1b84e963 100644 --- a/frontend/src/container/GridGraphComponent/index.tsx +++ b/frontend/src/container/GridGraphComponent/index.tsx @@ -1,6 +1,6 @@ import { Typography } from 'antd'; import { ChartData } from 'chart.js'; -import Graph, { GraphOnClickHandler } from 'components/Graph'; +import Graph, { GraphOnClickHandler, StaticLineProps } from 'components/Graph'; import { getYAxisFormattedValue } from 'components/Graph/yAxisConfig'; import ValueGraph from 'components/ValueGraph'; import { GRAPH_TYPES } from 'container/NewDashboard/ComponentsSlider'; @@ -18,6 +18,7 @@ function GridGraphComponent({ onClickHandler, name, yAxisUnit, + staticLine, }: GridGraphComponentProps): JSX.Element | null { const location = history.location.pathname; @@ -36,6 +37,7 @@ function GridGraphComponent({ onClickHandler, name, yAxisUnit, + staticLine, }} /> ); @@ -82,6 +84,7 @@ export interface GridGraphComponentProps { onClickHandler?: GraphOnClickHandler; name: string; yAxisUnit?: string; + staticLine?: StaticLineProps; } GridGraphComponent.defaultProps = { @@ -90,6 +93,7 @@ GridGraphComponent.defaultProps = { isStacked: undefined, onClickHandler: undefined, yAxisUnit: undefined, + staticLine: undefined, }; export default GridGraphComponent; diff --git a/frontend/src/container/ListAlertRules/ListAlert.tsx b/frontend/src/container/ListAlertRules/ListAlert.tsx index b851b0829a..4df6290725 100644 --- a/frontend/src/container/ListAlertRules/ListAlert.tsx +++ b/frontend/src/container/ListAlertRules/ListAlert.tsx @@ -64,9 +64,14 @@ function ListAlert({ allAlertRules, refetch }: ListAlertProps): JSX.Element { }, { title: 'Alert Name', - dataIndex: 'name', + dataIndex: 'alert', key: 'name', sorter: (a, b): number => a.name.charCodeAt(0) - b.name.charCodeAt(0), + render: (value, record): JSX.Element => ( + onEditHandler(record.id.toString())}> + {value} + + ), }, { title: 'Severity', @@ -83,7 +88,7 @@ function ListAlert({ allAlertRules, refetch }: ListAlertProps): JSX.Element { }, }, { - title: 'Tags', + title: 'Labels', dataIndex: 'labels', key: 'tags', align: 'center', @@ -100,7 +105,7 @@ function ListAlert({ allAlertRules, refetch }: ListAlertProps): JSX.Element { {withOutSeverityKeys.map((e) => { return ( - {e} + {e}: {value[e]} ); })} diff --git a/frontend/src/container/MetricsApplication/Tabs/DBCall.tsx 
b/frontend/src/container/MetricsApplication/Tabs/DBCall.tsx index 60441b7876..2c14c099f7 100644 --- a/frontend/src/container/MetricsApplication/Tabs/DBCall.tsx +++ b/frontend/src/container/MetricsApplication/Tabs/DBCall.tsx @@ -25,7 +25,7 @@ function DBCall({ getWidget }: DBCallProps): JSX.Element { fullViewOptions={false} widget={getWidget([ { - query: `sum(rate(signoz_db_latency_count{service_name="${servicename}"${resourceAttributePromQLQuery}}[1m])) by (db_system)`, + query: `sum(rate(signoz_db_latency_count{service_name="${servicename}"${resourceAttributePromQLQuery}}[5m])) by (db_system)`, legend: '{{db_system}}', }, ])} diff --git a/frontend/src/container/MetricsApplication/Tabs/External.tsx b/frontend/src/container/MetricsApplication/Tabs/External.tsx index 9811e2f269..4fd039e979 100644 --- a/frontend/src/container/MetricsApplication/Tabs/External.tsx +++ b/frontend/src/container/MetricsApplication/Tabs/External.tsx @@ -14,7 +14,7 @@ function External({ getWidget }: ExternalProps): JSX.Element { const { resourceAttributePromQLQuery } = useSelector( (state) => state.metrics, ); - const legend = '{{http_url}}'; + const legend = '{{address}}'; return ( <> @@ -28,7 +28,7 @@ function External({ getWidget }: ExternalProps): JSX.Element { fullViewOptions={false} widget={getWidget([ { - query: `max((sum(rate(signoz_external_call_latency_count{service_name="${servicename}", status_code="STATUS_CODE_ERROR"${resourceAttributePromQLQuery}}[1m]) OR rate(signoz_external_call_latency_count{service_name="${servicename}", http_status_code=~"5.."${resourceAttributePromQLQuery}}[1m]) OR vector(0)) by (http_url))*100/sum(rate(signoz_external_call_latency_count{service_name="${servicename}"${resourceAttributePromQLQuery}}[1m])) by (http_url)) < 1000 OR vector(0)`, + query: `max((sum(rate(signoz_external_call_latency_count{service_name="${servicename}", status_code="STATUS_CODE_ERROR"${resourceAttributePromQLQuery}}[5m]) OR vector(0)) by (address))*100/sum(rate(signoz_external_call_latency_count{service_name="${servicename}"${resourceAttributePromQLQuery}}[5m])) by (address)) < 1000 OR vector(0)`, legend: 'External Call Error Percentage', }, ])} @@ -68,7 +68,7 @@ function External({ getWidget }: ExternalProps): JSX.Element { fullViewOptions={false} widget={getWidget([ { - query: `sum(rate(signoz_external_call_latency_count{service_name="${servicename}"${resourceAttributePromQLQuery}}[5m])) by (http_url)`, + query: `sum(rate(signoz_external_call_latency_count{service_name="${servicename}"${resourceAttributePromQLQuery}}[5m])) by (address)`, legend, }, ])} @@ -87,7 +87,7 @@ function External({ getWidget }: ExternalProps): JSX.Element { fullViewOptions={false} widget={getWidget([ { - query: `(sum(rate(signoz_external_call_latency_sum{service_name="${servicename}"${resourceAttributePromQLQuery}}[5m])) by (http_url))/(sum(rate(signoz_external_call_latency_count{service_name="${servicename}"${resourceAttributePromQLQuery}}[5m])) by (http_url))`, + query: `(sum(rate(signoz_external_call_latency_sum{service_name="${servicename}"${resourceAttributePromQLQuery}}[5m])) by (address))/(sum(rate(signoz_external_call_latency_count{service_name="${servicename}"${resourceAttributePromQLQuery}}[5m])) by (address))`, legend, }, ])} diff --git a/frontend/src/container/MetricsApplication/Tabs/Overview.tsx b/frontend/src/container/MetricsApplication/Tabs/Overview.tsx index a53714d05d..803ed91bcc 100644 --- a/frontend/src/container/MetricsApplication/Tabs/Overview.tsx +++ 
b/frontend/src/container/MetricsApplication/Tabs/Overview.tsx @@ -193,7 +193,7 @@ function Application({ getWidget }: DashboardProps): JSX.Element { }} widget={getWidget([ { - query: `sum(rate(signoz_latency_count{service_name="${servicename}", span_kind="SPAN_KIND_SERVER"${resourceAttributePromQLQuery}}[2m]))`, + query: `sum(rate(signoz_latency_count{service_name="${servicename}", span_kind="SPAN_KIND_SERVER"${resourceAttributePromQLQuery}}[5m]))`, legend: 'Requests', }, ])} @@ -227,7 +227,7 @@ function Application({ getWidget }: DashboardProps): JSX.Element { }} widget={getWidget([ { - query: `max(sum(rate(signoz_calls_total{service_name="${servicename}", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"${resourceAttributePromQLQuery}}[1m]) OR rate(signoz_calls_total{service_name="${servicename}", span_kind="SPAN_KIND_SERVER", http_status_code=~"5.."${resourceAttributePromQLQuery}}[1m]))*100/sum(rate(signoz_calls_total{service_name="${servicename}", span_kind="SPAN_KIND_SERVER"${resourceAttributePromQLQuery}}[1m]))) < 1000 OR vector(0)`, + query: `max(sum(rate(signoz_calls_total{service_name="${servicename}", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"${resourceAttributePromQLQuery}}[5m]) OR rate(signoz_calls_total{service_name="${servicename}", span_kind="SPAN_KIND_SERVER", http_status_code=~"5.."${resourceAttributePromQLQuery}}[5m]))*100/sum(rate(signoz_calls_total{service_name="${servicename}", span_kind="SPAN_KIND_SERVER"${resourceAttributePromQLQuery}}[5m]))) < 1000 OR vector(0)`, legend: 'Error Percentage', }, ])} diff --git a/frontend/src/container/MetricsTable/index.tsx b/frontend/src/container/MetricsTable/index.tsx index ff700da83a..cc0778c80e 100644 --- a/frontend/src/container/MetricsTable/index.tsx +++ b/frontend/src/container/MetricsTable/index.tsx @@ -56,7 +56,7 @@ function Metrics(): JSX.Element { render: (value: number): string => (value / 1000000).toFixed(2), }, { - title: 'Error Rate (in %)', + title: 'Error Rate (% of requests)', dataIndex: 'errorRate', key: 'errorRate', sorter: (a: DataProps, b: DataProps): number => a.errorRate - b.errorRate, diff --git a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/promQL/index.tsx b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/promQL/index.tsx index 4dee33c779..55adbd740b 100644 --- a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/promQL/index.tsx +++ b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/promQL/index.tsx @@ -29,7 +29,7 @@ function PromQLQueryContainer({ toggleDelete, }: IPromQLQueryHandleChange): void => { const allQueries = queryData[WIDGET_PROMQL_QUERY_KEY_NAME]; - const currentIndexQuery = allQueries[queryIndex]; + const currentIndexQuery = allQueries[queryIndex as number]; if (query !== undefined) currentIndexQuery.query = query; if (legend !== undefined) currentIndexQuery.legend = legend; @@ -37,7 +37,7 @@ function PromQLQueryContainer({ currentIndexQuery.disabled = !currentIndexQuery.disabled; } if (toggleDelete) { - allQueries.splice(queryIndex, 1); + allQueries.splice(queryIndex as number, 1); } updateQueryData({ updatedQuery: { ...queryData } }); }; diff --git a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/promQL/query.tsx b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/promQL/query.tsx index 1a6dd2f9d2..6cffd55d8d 100644 --- a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/promQL/query.tsx +++ 
b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/promQL/query.tsx @@ -7,7 +7,7 @@ import { IPromQLQueryHandleChange } from './types'; interface IPromQLQueryBuilderProps { queryData: IPromQLQuery; - queryIndex: number; + queryIndex: number | string; handleQueryChange: (args: IPromQLQueryHandleChange) => void; } diff --git a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/promQL/types.ts b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/promQL/types.ts index f1c88dd488..668a0c1f87 100644 --- a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/promQL/types.ts +++ b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/promQL/types.ts @@ -1,7 +1,7 @@ import { IPromQLQuery } from 'types/api/dashboard/getAll'; export interface IPromQLQueryHandleChange { - queryIndex: number; + queryIndex: number | string; query?: IPromQLQuery['query']; legend?: IPromQLQuery['legend']; toggleDisable?: IPromQLQuery['disabled']; diff --git a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/formula.tsx b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/formula.tsx index 5be08f044e..02bc41198c 100644 --- a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/formula.tsx +++ b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/formula.tsx @@ -9,7 +9,7 @@ const { TextArea } = Input; interface IMetricsBuilderFormulaProps { formulaData: IMetricsBuilderFormula; - formulaIndex: number; + formulaIndex: number | string; handleFormulaChange: (args: IQueryBuilderFormulaHandleChange) => void; } function MetricsBuilderFormula({ diff --git a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/index.tsx b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/index.tsx index 5b05eeca91..fdb6d4b7bc 100644 --- a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/index.tsx +++ b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/index.tsx @@ -50,7 +50,7 @@ function QueryBuilderQueryContainer({ }: IQueryBuilderQueryHandleChange): void => { const allQueries = queryData[WIDGET_QUERY_BUILDER_QUERY_KEY_NAME].queryBuilder; - const currentIndexQuery = allQueries[queryIndex]; + const currentIndexQuery = allQueries[queryIndex as number]; if (aggregateFunction) { currentIndexQuery.aggregateOperator = aggregateFunction; } @@ -78,7 +78,7 @@ function QueryBuilderQueryContainer({ currentIndexQuery.disabled = !currentIndexQuery.disabled; } if (toggleDelete) { - allQueries.splice(queryIndex, 1); + allQueries.splice(queryIndex as number, 1); } updateQueryData({ updatedQuery: { ...queryData } }); }; @@ -92,7 +92,7 @@ function QueryBuilderQueryContainer({ queryData[WIDGET_QUERY_BUILDER_QUERY_KEY_NAME][ WIDGET_QUERY_BUILDER_FORMULA_KEY_NAME ]; - const currentIndexFormula = allFormulas[formulaIndex]; + const currentIndexFormula = allFormulas[formulaIndex as number]; if (expression) { currentIndexFormula.expression = expression; @@ -103,7 +103,7 @@ function QueryBuilderQueryContainer({ } if (toggleDelete) { - allFormulas.splice(formulaIndex, 1); + allFormulas.splice(formulaIndex as number, 1); } updateQueryData({ updatedQuery: { ...queryData } }); }; diff --git a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/query.tsx 
b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/query.tsx index fccf108b41..8f171baa3c 100644 --- a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/query.tsx +++ b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/query.tsx @@ -15,7 +15,7 @@ import { IQueryBuilderQueryHandleChange } from './types'; const { Option } = Select; interface IMetricsBuilderProps { - queryIndex: number; + queryIndex: number | string; selectedGraph: GRAPH_TYPES; queryData: IMetricsBuilderQuery; handleQueryChange: (args: IQueryBuilderQueryHandleChange) => void; diff --git a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/types.ts b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/types.ts index 8d177cffd8..c577b8d123 100644 --- a/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/types.ts +++ b/frontend/src/container/NewWidget/LeftContainer/QuerySection/QueryBuilder/queryBuilder/types.ts @@ -4,7 +4,7 @@ import { } from 'types/api/dashboard/getAll'; export interface IQueryBuilderQueryHandleChange { - queryIndex: number; + queryIndex: number | string; aggregateFunction?: IMetricsBuilderQuery['aggregateOperator']; metricName?: IMetricsBuilderQuery['metricName']; tagFilters?: IMetricsBuilderQuery['tagFilters']['items']; @@ -16,7 +16,7 @@ export interface IQueryBuilderQueryHandleChange { } export interface IQueryBuilderFormulaHandleChange { - formulaIndex: number; + formulaIndex: number | string; expression?: IMetricsBuilderFormula['expression']; toggleDisable?: IMetricsBuilderFormula['disabled']; toggleDelete?: boolean; diff --git a/frontend/src/container/TopNav/DateTimeSelection/config.ts b/frontend/src/container/TopNav/DateTimeSelection/config.ts index 29d031e25b..69bdde40c7 100644 --- a/frontend/src/container/TopNav/DateTimeSelection/config.ts +++ b/frontend/src/container/TopNav/DateTimeSelection/config.ts @@ -1,20 +1,24 @@ import ROUTES from 'constants/routes'; type FiveMin = '5min'; +type TenMin = '10min'; type FifteenMin = '15min'; type ThirtyMin = '30min'; type OneMin = '1min'; type SixHour = '6hr'; type OneHour = '1hr'; +type FourHour = '4hr'; type OneDay = '1day'; type OneWeek = '1week'; type Custom = 'custom'; export type Time = | FiveMin + | TenMin | FifteenMin | ThirtyMin | OneMin + | FourHour | SixHour | OneHour | Custom diff --git a/frontend/src/container/TopNav/index.tsx b/frontend/src/container/TopNav/index.tsx index ffd1b28175..ddf10023a3 100644 --- a/frontend/src/container/TopNav/index.tsx +++ b/frontend/src/container/TopNav/index.tsx @@ -19,6 +19,9 @@ const routesToSkip = [ ROUTES.ALL_DASHBOARD, ROUTES.ORG_SETTINGS, ROUTES.ERROR_DETAIL, + ROUTES.ALERTS_NEW, + ROUTES.EDIT_ALERTS, + ROUTES.LIST_ALL_ALERT, ]; function TopNav(): JSX.Element | null { diff --git a/frontend/src/container/Trace/Filters/index.tsx b/frontend/src/container/Trace/Filters/index.tsx index 276b69b227..95f73f4ed9 100644 --- a/frontend/src/container/Trace/Filters/index.tsx +++ b/frontend/src/container/Trace/Filters/index.tsx @@ -9,7 +9,9 @@ export const AllTraceFilterEnum: TraceFilterEnum[] = [ 'serviceName', 'operation', 'component', - 'httpCode', + 'rpcMethod', + 'responseStatusCode', + // 'httpCode', 'httpHost', 'httpMethod', 'httpRoute', diff --git a/frontend/src/container/Trace/TraceGraphFilter/config.ts b/frontend/src/container/Trace/TraceGraphFilter/config.ts index 882ccc2e96..357f22a3ee 100644 --- 
a/frontend/src/container/Trace/TraceGraphFilter/config.ts +++ b/frontend/src/container/Trace/TraceGraphFilter/config.ts @@ -38,6 +38,14 @@ export const groupBy: Dropdown[] = [ displayValue: 'HTTP status code', key: 'httpCode', }, + { + displayValue: 'RPC Method', + key: 'rpcMethod', + }, + { + displayValue: 'Status Code', + key: 'responseStatusCode', + }, { displayValue: 'Database name', key: 'dbName', diff --git a/frontend/src/container/TraceDetail/Missingtrace.tsx b/frontend/src/container/TraceDetail/Missingtrace.tsx index b203f05f68..eb0620a4ed 100644 --- a/frontend/src/container/TraceDetail/Missingtrace.tsx +++ b/frontend/src/container/TraceDetail/Missingtrace.tsx @@ -28,11 +28,10 @@ function MissingSpansMessage(): JSX.Element { justifyContent: 'center', alignItems: 'center', margin: '1rem 0', + fontSize: '0.8rem', }} > - {' '} + {' '} This trace has missing spans diff --git a/frontend/src/container/TraceDetail/SelectedSpanDetails/EllipsedButton.tsx b/frontend/src/container/TraceDetail/SelectedSpanDetails/EllipsedButton.tsx new file mode 100644 index 0000000000..56ef64e4ee --- /dev/null +++ b/frontend/src/container/TraceDetail/SelectedSpanDetails/EllipsedButton.tsx @@ -0,0 +1,53 @@ +import { StyledButton } from 'components/Styled'; +import React from 'react'; + +import { styles } from './styles'; + +function EllipsedButton({ + onToggleHandler, + setText, + value, + event, + buttonText, +}: Props): JSX.Element { + const isFullValueButton = buttonText === 'View full value'; + + const style = [styles.removePadding]; + + if (!isFullValueButton) { + style.push(styles.removeMargin); + } else { + style.push(styles.selectedSpanDetailsContainer); + style.push(styles.buttonContainer); + } + + return ( + <StyledButton + styledclass={style} + onClick={(): void => { + onToggleHandler(true); + setText({ + subText: value, + text: event, + }); + }} + type="link" + > + {buttonText} + </StyledButton> + ); +} + +interface Props { + onToggleHandler: (isOpen: boolean) => void; + setText: (text: { subText: string; text: string }) => void; + value: string; + event: string; + buttonText?: string; +} + +EllipsedButton.defaultProps = { + buttonText: 'View full log event message', +}; + +export default EllipsedButton; diff --git a/frontend/src/container/TraceDetail/SelectedSpanDetails/ErrorTag.tsx b/frontend/src/container/TraceDetail/SelectedSpanDetails/ErrorTag.tsx index 2a663387a5..69b51b3cd8 100644 --- a/frontend/src/container/TraceDetail/SelectedSpanDetails/ErrorTag.tsx +++ b/frontend/src/container/TraceDetail/SelectedSpanDetails/ErrorTag.tsx @@ -1,29 +1,22 @@ -import { Collapse, Modal } from 'antd'; -import Editor from 'components/Editor'; -import { StyledButton } from 'components/Styled'; +import { Collapse } from 'antd'; import useThemeMode from 'hooks/useThemeMode'; import keys from 'lodash-es/keys'; import map from 'lodash-es/map'; -import React, { useState } from 'react'; +import React from 'react'; import { ITraceTree } from 'types/api/trace/getTraceItem'; -import { CustomSubText, CustomSubTitle, styles } from './styles'; +import EllipsedButton from './EllipsedButton'; +import { CustomSubText, CustomSubTitle } from './styles'; const { Panel } = Collapse; -function ErrorTag({ event }: ErrorTagProps): JSX.Element { - const [isOpen, setIsOpen] = useState(false); +function ErrorTag({ + event, + onToggleHandler, + setText, +}: ErrorTagProps): JSX.Element { const { isDarkMode } = useThemeMode(); - const [text, setText] = useState({ - text: '', - subText: '', - }); - - const onToggleHandler = (state: boolean): void => { - setIsOpen(state); - }; - return ( <> {map(event, ({ attributeMap, name }) => { @@ -45,23 +38,23 @@ function ErrorTag({ event }: ErrorTagProps): JSX.Element { return ( <> <CustomSubTitle>{event}</CustomSubTitle> - + {value} {isEllipsed && ( - <StyledButton - styledclass={[styles.removeMargin, styles.removePadding]} - onClick={(): void => { - onToggleHandler(true); - setText({ - subText: value, - text: event, - }); - }} - type="link" - > + <EllipsedButton + {...{ event, onToggleHandler, setText, value }} - View full log event message - </StyledButton> + /> )}
@@ -71,31 +64,14 @@ function ErrorTag({ event }: ErrorTagProps): JSX.Element { ); })} - - <Modal - onCancel={(): void => onToggleHandler(false)} - title="Log Message" - visible={isOpen} - destroyOnClose - footer={[]} - width="70vw" - > - {text.text} - - {text.text === 'exception.stacktrace' ? ( - <Editor onChange={(): void => {}} readOnly value={text.subText} /> - ) : ( - <CustomSubText isDarkMode={isDarkMode}> - {text.subText} - </CustomSubText> - )} - </Modal> ); } interface ErrorTagProps { event: ITraceTree['event']; + onToggleHandler: (isOpen: boolean) => void; + setText: (text: { subText: string; text: string }) => void; } export default ErrorTag; diff --git a/frontend/src/container/TraceDetail/SelectedSpanDetails/index.tsx b/frontend/src/container/TraceDetail/SelectedSpanDetails/index.tsx index 08d6c057a9..49596d14d0 100644 --- a/frontend/src/container/TraceDetail/SelectedSpanDetails/index.tsx +++ b/frontend/src/container/TraceDetail/SelectedSpanDetails/index.tsx @@ -1,9 +1,11 @@ -import { Tabs, Tooltip, Typography } from 'antd'; +import { Modal, Tabs, Tooltip, Typography } from 'antd'; +import Editor from 'components/Editor'; import { StyledSpace } from 'components/Styled'; import useThemeMode from 'hooks/useThemeMode'; -import React, { useMemo } from 'react'; +import React, { useMemo, useState } from 'react'; import { ITraceTree } from 'types/api/trace/getTraceItem'; +import EllipsedButton from './EllipsedButton'; import ErrorTag from './ErrorTag'; import { CardContainer, @@ -12,6 +14,7 @@ import { CustomText, CustomTitle, styles, + SubTextContainer, } from './styles'; const { TabPane } = Tabs; @@ -26,6 +29,17 @@ function SelectedSpanDetails(props: SelectedSpanDetailsProps): JSX.Element { tree?.serviceName, ]); + const [isOpen, setIsOpen] = useState(false); + + const [text, setText] = useState({ + text: '', + subText: '', + }); + + const onToggleHandler = (state: boolean): void => { + setIsOpen(state); + }; + if (!tree) { return <div />
; } @@ -52,18 +66,60 @@ function SelectedSpanDetails(props: SelectedSpanDetailsProps): JSX.Element { + onToggleHandler(false)} + title={text.text} + visible={isOpen} + destroyOnClose + footer={[]} + width="70vw" + centered + > + {text.text === 'exception.stacktrace' ? ( + {}} readOnly value={text.subText} /> + ) : ( + + {text.subText} + + )} + + {tags.length !== 0 ? ( tags.map((tags) => { + const value = tags.key === 'error' ? 'true' : tags.value; + const isEllipsed = value.length > 24; + return ( {tags.value && ( <> {tags.key} - - {tags.key === 'error' ? 'true' : tags.value} - + + value}> + + {value} + + + {isEllipsed && ( + + )} + + )} @@ -75,7 +131,11 @@ function SelectedSpanDetails(props: SelectedSpanDetailsProps): JSX.Element { {tree.event && Object.keys(tree.event).length !== 0 ? ( - + ) : ( No events data in selected span )} diff --git a/frontend/src/container/TraceDetail/SelectedSpanDetails/styles.ts b/frontend/src/container/TraceDetail/SelectedSpanDetails/styles.ts index d8bae86ba7..3c9180dc94 100644 --- a/frontend/src/container/TraceDetail/SelectedSpanDetails/styles.ts +++ b/frontend/src/container/TraceDetail/SelectedSpanDetails/styles.ts @@ -18,7 +18,8 @@ export const CustomText = styled(Paragraph)` export const CustomSubTitle = styled(Title)` &&& { font-size: 14px; - margin-bottom: 8px; + margin-bottom: 0.1rem; + margin-top: 0.5rem; } `; @@ -26,13 +27,19 @@ interface CustomSubTextProps { isDarkMode: boolean; } +export const SubTextContainer = styled.div` + &&& { + background: ${({ isDarkMode }): string => (isDarkMode ? '#444' : '#ddd')}; + } +`; + export const CustomSubText = styled(Paragraph)` &&& { background: ${({ isDarkMode }): string => (isDarkMode ? '#444' : '#ddd')}; font-size: 12px; - padding: 6px 8px; + padding: 4px 8px; word-break: break-all; - margin-bottom: 16px; + margin-bottom: 0rem; } `; @@ -81,10 +88,15 @@ const overflow = css` } `; +const buttonContainer = css` + height: 1.5rem; +`; + export const styles = { removeMargin, removePadding, selectedSpanDetailsContainer, spanEventsTabsContainer, overflow, + buttonContainer, }; diff --git a/frontend/src/lib/createQueryParams.ts b/frontend/src/lib/createQueryParams.ts index 0f1b3f7ad7..5630098cb2 100644 --- a/frontend/src/lib/createQueryParams.ts +++ b/frontend/src/lib/createQueryParams.ts @@ -1,6 +1,6 @@ -const createQueryParams = (params: { [x: string]: string }): string => +const createQueryParams = (params: { [x: string]: string | number }): string => Object.keys(params) - .map((k) => `${k}=${encodeURI(params[k])}`) + .map((k) => `${k}=${encodeURI(String(params[k]))}`) .join('&'); export default createQueryParams; diff --git a/frontend/src/lib/getMinMax.ts b/frontend/src/lib/getMinMax.ts index 9c1fab94c3..ae830cc06a 100644 --- a/frontend/src/lib/getMinMax.ts +++ b/frontend/src/lib/getMinMax.ts @@ -13,6 +13,9 @@ const GetMinMax = ( if (interval === '1min') { const minTimeAgo = getMinAgo({ minutes: 1 }).getTime(); minTime = minTimeAgo; + } else if (interval === '10min') { + const minTimeAgo = getMinAgo({ minutes: 10 }).getTime(); + minTime = minTimeAgo; } else if (interval === '15min') { const minTimeAgo = getMinAgo({ minutes: 15 }).getTime(); minTime = minTimeAgo; @@ -33,8 +36,9 @@ const GetMinMax = ( // one week = one day * 7 const minTimeAgo = getMinAgo({ minutes: 26 * 60 * 7 }).getTime(); minTime = minTimeAgo; - } else if (interval === '6hr') { - const minTimeAgo = getMinAgo({ minutes: 6 * 60 }).getTime(); + } else if (['4hr', '6hr'].includes(interval)) { + const h = parseInt(interval.replace('hr', ''), 10); + 
const minTimeAgo = getMinAgo({ minutes: h * 60 }).getTime(); minTime = minTimeAgo; } else if (interval === 'custom') { maxTime = (dateTimeRange || [])[1] || 0; diff --git a/frontend/src/pages/CreateAlert/index.tsx b/frontend/src/pages/CreateAlert/index.tsx index edfe543b1f..3bab0c1ee7 100644 --- a/frontend/src/pages/CreateAlert/index.tsx +++ b/frontend/src/pages/CreateAlert/index.tsx @@ -1,109 +1,9 @@ -import { SaveOutlined } from '@ant-design/icons'; -import { Button, notification } from 'antd'; -import createAlertsApi from 'api/alerts/create'; -import Editor from 'components/Editor'; -import ROUTES from 'constants/routes'; -import { State } from 'hooks/useFetch'; -import history from 'lib/history'; -import React, { useCallback, useState } from 'react'; -import { PayloadProps as CreateAlertPayloadProps } from 'types/api/alerts/create'; +import CreateAlertRule from 'container/CreateAlertRule'; +import React from 'react'; +import { alertDefaults } from 'types/api/alerts/create'; -import { ButtonContainer, Title } from './styles'; - -function CreateAlert(): JSX.Element { - const [value, setEditorValue] = useState( - `\n alert: High RPS\n expr: sum(rate(signoz_latency_count{span_kind="SPAN_KIND_SERVER"}[2m])) by (service_name) > 100\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: High RPS of Applications\n description: "RPS is > 100\n\t\t\t VALUE = {{ $value }}\n\t\t\t LABELS = {{ $labels }}"\n `, - ); - - const [newAlertState, setNewAlertState] = useState< - State - >({ - error: false, - errorMessage: '', - loading: false, - payload: undefined, - success: false, - }); - const [notifications, Element] = notification.useNotification(); - - const defaultError = - 'Oops! Some issue occured in saving the alert please try again or contact support@signoz.io'; - - const onSaveHandler = useCallback(async () => { - try { - setNewAlertState((state) => ({ - ...state, - loading: true, - })); - - if (value.length === 0) { - setNewAlertState((state) => ({ - ...state, - loading: false, - })); - notifications.error({ - description: `Oops! We didn't catch that. Please make sure the alert settings are not empty or try again`, - message: 'Error', - }); - return; - } - - const response = await createAlertsApi({ - query: value, - }); - - if (response.statusCode === 200) { - setNewAlertState((state) => ({ - ...state, - loading: false, - payload: response.payload, - })); - notifications.success({ - message: 'Success', - description: 'Congrats. 
The alert was saved correctly.', - }); - - setTimeout(() => { - history.push(ROUTES.LIST_ALL_ALERT); - }, 3000); - } else { - notifications.error({ - description: response.error || defaultError, - message: 'Error', - }); - setNewAlertState((state) => ({ - ...state, - loading: false, - error: true, - errorMessage: response.error || defaultError, - })); - } - } catch (error) { - notifications.error({ - message: defaultError, - }); - } - }, [notifications, value]); - - return ( - <> - {Element} - - <Title>Create New Alert</Title> - <Editor onChange={(value): void => setEditorValue(value)} value={value} /> - - - - - - ); +function CreateAlertPage(): JSX.Element { + return <CreateAlertRule initialValue={alertDefaults} />; } -export default CreateAlert; +export default CreateAlertPage; diff --git a/frontend/src/pages/EditRules/index.tsx b/frontend/src/pages/EditRules/index.tsx index 09cda600ab..0217e40efc 100644 --- a/frontend/src/pages/EditRules/index.tsx +++ b/frontend/src/pages/EditRules/index.tsx @@ -47,7 +47,12 @@ function EditRules(): JSX.Element { return ; } - return ; + return ( + + ); } export default EditRules; diff --git a/frontend/src/pages/ErrorDetails/index.tsx b/frontend/src/pages/ErrorDetails/index.tsx index 5f9e6c8e9f..348391b741 100644 --- a/frontend/src/pages/ErrorDetails/index.tsx +++ b/frontend/src/pages/ErrorDetails/index.tsx @@ -4,107 +4,87 @@ import getById from 'api/errors/getById'; import Spinner from 'components/Spinner'; import ROUTES from 'constants/routes'; import ErrorDetailsContainer from 'container/ErrorDetails'; -import React from 'react'; +import React, { useMemo } from 'react'; import { useTranslation } from 'react-i18next'; import { useQuery } from 'react-query'; import { useSelector } from 'react-redux'; import { Redirect, useLocation } from 'react-router-dom'; import { AppState } from 'store/reducers'; -import { PayloadProps } from 'types/api/errors/getById'; import { GlobalReducer } from 'types/reducer/globalTime'; +import { urlKey } from './utils'; + +// eslint-disable-next-line sonarjs/cognitive-complexity function ErrorDetails(): JSX.Element { const { t } = useTranslation(['common']); const { maxTime, minTime } = useSelector<AppState, GlobalReducer>( (state) => state.globalTime, ); const { search } = useLocation(); - const params = new URLSearchParams(search); + const params = useMemo(() => new URLSearchParams(search), [search]); + + const groupId = params.get(urlKey.groupId); + const errorId = params.get(urlKey.errorId); + const timestamp = params.get(urlKey.timestamp); - const errorId = params.get('errorId'); - const errorType = params.get('errorType'); - const serviceName = params.get('serviceName'); const defaultError = t('something_went_wrong'); - const { data, status } = useQuery( - [ - 'errorByType', - errorType, - 'serviceName', - serviceName, - maxTime, - minTime, - errorId, - ], - { - queryFn: () => - getByErrorType({ - end: maxTime, - errorType: errorType || '', - serviceName: serviceName || '', - start: minTime, - }), - enabled: errorId === null && errorType !== null && serviceName !== null, - cacheTime: 5000, - }, - ); - - const { status: ErrorIdStatus, data: errorIdPayload } = useQuery( - [ - 'errorByType', - errorType, - 'serviceName', - serviceName, - maxTime, - minTime, - 'errorId', - errorId, - ], + const { data: IdData, status: IdStatus } = useQuery( + [errorId, timestamp, groupId], { queryFn: () => getById({ - end: maxTime, - errorId: errorId || data?.payload?.errorId || '', - start: minTime, + errorID: errorId || '', + groupID: groupId || '', + timestamp: timestamp || '', }), enabled: - (errorId !== null || status === 'success') && - errorType !== null
&& - serviceName !== null, - cacheTime: 5000, + errorId !== null && + groupId !== null && + timestamp !== null && + errorId.length !== 0 && + groupId.length !== 0 && + timestamp.length !== 0, }, ); + const { data, status } = useQuery([maxTime, minTime, groupId], { + queryFn: () => + getByErrorType({ + groupID: groupId || '', + timestamp: timestamp || '', + }), + enabled: !!groupId && IdStatus !== 'success', + }); + // if errorType and serviceName is null redirecting to the ALL_ERROR page not now - if (errorType === null || serviceName === null) { + if (groupId === null || timestamp === null) { return ; } // when the api is in loading state - if (status === 'loading' || ErrorIdStatus === 'loading') { + if (status === 'loading' || IdStatus === 'loading') { return ; } // if any error occurred while loading - if (status === 'error' || ErrorIdStatus === 'error') { - return ( - - {data?.error || errorIdPayload?.error || defaultError} - - ); + if (status === 'error' || IdStatus === 'error') { + return {data?.error || defaultError}; } + const idPayload = data?.payload || IdData?.payload; + // if API is successfully but there is an error if ( (status === 'success' && data?.statusCode >= 400) || - (ErrorIdStatus === 'success' && errorIdPayload.statusCode >= 400) + (IdStatus === 'success' && IdData.statusCode >= 400) || + idPayload === null || + idPayload === undefined ) { return {data?.error || defaultError}; } - return ( - - ); + return ; } export interface ErrorDetailsParams { diff --git a/frontend/src/pages/ErrorDetails/utils.ts b/frontend/src/pages/ErrorDetails/utils.ts new file mode 100644 index 0000000000..e26db2290f --- /dev/null +++ b/frontend/src/pages/ErrorDetails/utils.ts @@ -0,0 +1,8 @@ +export const urlKey = { + serviceName: 'serviceName', + exceptionType: 'exceptionType', + groupId: 'groupId', + lastSeen: 'lastSeen', + errorId: 'errorId', + timestamp: 'timestamp', +}; diff --git a/frontend/src/pages/SignUp/SignUp.tsx b/frontend/src/pages/SignUp/SignUp.tsx index d184e74a4e..9465d870c4 100644 --- a/frontend/src/pages/SignUp/SignUp.tsx +++ b/frontend/src/pages/SignUp/SignUp.tsx @@ -262,12 +262,13 @@ function SignUp({ version }: SignUpProps): JSX.Element { setState(updateValue, setConfirmPassword); }} required - id="UpdatePassword" + id="confirmPassword" /> {confirmPasswordError && ( , + Omit { + aggregateOperator: EAggregateOperator | undefined; + disabled: boolean; + name: string; + legend?: string; + metricName: string | null; + groupBy?: string[]; + expression?: string; + tagFilters?: IQueryBuilderTagFilters; + toggleDisable?: boolean; + toggleDelete?: boolean; +} + +export interface IFormulaQueries { + [key: string]: IFormulaQuery; +} + +export interface IFormulaQuery extends IMetricsBuilderFormula { + formulaOnly: boolean; + queryName: string; +} + +export interface IMetricQueries { + [key: string]: IMetricQuery; +} + +export interface IMetricQuery extends IMetricsBuilderQuery { + formulaOnly: boolean; + expression?: string; + queryName: string; +} diff --git a/frontend/src/types/api/alerts/create.ts b/frontend/src/types/api/alerts/create.ts index 6a2e5c09ab..6f179af79a 100644 --- a/frontend/src/types/api/alerts/create.ts +++ b/frontend/src/types/api/alerts/create.ts @@ -1,8 +1,48 @@ +import { AlertDef } from 'types/api/alerts/def'; + +import { defaultCompareOp, defaultEvalWindow, defaultMatchType } from './def'; + export interface Props { - query: string; + data: AlertDef; } export interface PayloadProps { status: string; data: string; } + +export const alertDefaults: AlertDef = { + 
condition: { + compositeMetricQuery: { + builderQueries: { + A: { + queryName: 'A', + name: 'A', + formulaOnly: false, + metricName: '', + tagFilters: { + op: 'AND', + items: [], + }, + groupBy: [], + aggregateOperator: 1, + expression: 'A', + disabled: false, + toggleDisable: false, + toggleDelete: false, + }, + }, + promQueries: {}, + queryType: 1, + }, + op: defaultCompareOp, + matchType: defaultMatchType, + }, + labels: { + severity: 'warning', + }, + annotations: { + description: 'A new alert', + }, + evalWindow: defaultEvalWindow, +}; diff --git a/frontend/src/types/api/alerts/def.ts b/frontend/src/types/api/alerts/def.ts new file mode 100644 index 0000000000..060bdc4d73 --- /dev/null +++ b/frontend/src/types/api/alerts/def.ts @@ -0,0 +1,32 @@ +import { ICompositeMetricQuery } from 'types/api/alerts/compositeQuery'; + +// default match type for threshold +export const defaultMatchType = '1'; + +// default eval window +export const defaultEvalWindow = '5m0s'; + +// default compare op: above +export const defaultCompareOp = '1'; + +export interface AlertDef { + id?: number; + alert?: string; + ruleType?: string; + condition: RuleCondition; + labels?: Labels; + annotations?: Labels; + evalWindow?: string; + source?: string; +} + +export interface RuleCondition { + compositeMetricQuery: ICompositeMetricQuery; + op?: string | undefined; + target?: number | undefined; + matchType?: string | undefined; +} + +export interface Labels { + [key: string]: string; +} diff --git a/frontend/src/types/api/alerts/get.ts b/frontend/src/types/api/alerts/get.ts index 52e9a78e7b..69eef474e1 100644 --- a/frontend/src/types/api/alerts/get.ts +++ b/frontend/src/types/api/alerts/get.ts @@ -1,9 +1,9 @@ -import { Alerts } from './getAll'; +import { AlertDef } from './def'; export interface Props { - id: Alerts['id']; + id: AlertDef['id']; } export type PayloadProps = { - data: string; + data: AlertDef; }; diff --git a/frontend/src/types/api/alerts/put.ts b/frontend/src/types/api/alerts/put.ts deleted file mode 100644 index e70de0b630..0000000000 --- a/frontend/src/types/api/alerts/put.ts +++ /dev/null @@ -1,9 +0,0 @@ -import { PayloadProps as DeletePayloadProps } from './delete'; -import { Alerts } from './getAll'; - -export type PayloadProps = DeletePayloadProps; - -export interface Props { - id: Alerts['id']; - data: DeletePayloadProps['data']; -} diff --git a/frontend/src/types/api/alerts/queryType.ts b/frontend/src/types/api/alerts/queryType.ts new file mode 100644 index 0000000000..277d6f0703 --- /dev/null +++ b/frontend/src/types/api/alerts/queryType.ts @@ -0,0 +1,17 @@ +export type QueryType = 1 | 2 | 3; + +export const QUERY_BUILDER: QueryType = 1; +export const PROMQL: QueryType = 3; + +export const resolveQueryCategoryName = (s: number): string => { + switch (s) { + case 1: + return 'Query Builder'; + case 2: + return 'Clickhouse Query'; + case 3: + return 'PromQL'; + default: + return ''; + } +}; diff --git a/frontend/src/types/api/alerts/save.ts b/frontend/src/types/api/alerts/save.ts new file mode 100644 index 0000000000..a815c728d2 --- /dev/null +++ b/frontend/src/types/api/alerts/save.ts @@ -0,0 +1,11 @@ +import { AlertDef } from './def'; + +export type PayloadProps = { + status: string; + data: string; +}; + +export interface Props { + id?: number; + data: AlertDef; +} diff --git a/frontend/src/types/api/errors/getAll.ts b/frontend/src/types/api/errors/getAll.ts index 98c3122f7d..d0bbd7995a 100644 --- a/frontend/src/types/api/errors/getAll.ts +++ b/frontend/src/types/api/errors/getAll.ts @@ 
-1,8 +1,20 @@ import { GlobalTime } from 'types/actions/globalTime'; +export type Order = 'ascending' | 'descending'; +export type OrderBy = + | 'serviceName' + | 'exceptionCount' + | 'lastSeen' + | 'firstSeen' + | 'exceptionType'; + export interface Props { start: GlobalTime['minTime']; end: GlobalTime['maxTime']; + order?: Order; + orderParam?: OrderBy; + limit?: number; + offset?: number; } export interface Exception { @@ -12,6 +24,7 @@ export interface Exception { lastSeen: string; firstSeen: string; serviceName: string; + groupID: string; } export type PayloadProps = Exception[]; diff --git a/frontend/src/types/api/errors/getByErrorId.ts b/frontend/src/types/api/errors/getByErrorId.ts new file mode 100644 index 0000000000..cceef67ded --- /dev/null +++ b/frontend/src/types/api/errors/getByErrorId.ts @@ -0,0 +1,9 @@ +import { PayloadProps as Prop } from './getByErrorTypeAndService'; + +export interface Props { + groupID: string; + errorID: string; + timestamp: string; +} + +export type PayloadProps = Prop; diff --git a/frontend/src/types/api/errors/getByErrorTypeAndService.ts b/frontend/src/types/api/errors/getByErrorTypeAndService.ts index 4f987874b7..dc15c786ee 100644 --- a/frontend/src/types/api/errors/getByErrorTypeAndService.ts +++ b/frontend/src/types/api/errors/getByErrorTypeAndService.ts @@ -1,10 +1,6 @@ -import { GlobalTime } from 'types/actions/globalTime'; - export interface Props { - start: GlobalTime['minTime']; - end: GlobalTime['maxTime']; - serviceName: string; - errorType: string; + timestamp: string; + groupID: string; } export interface PayloadProps { @@ -16,7 +12,6 @@ export interface PayloadProps { timestamp: string; spanID: string; traceID: string; - serviceName: Props['serviceName']; - newerErrorId: string; - olderErrorId: string; + serviceName: string; + groupID: string; } diff --git a/frontend/src/types/api/errors/getById.ts b/frontend/src/types/api/errors/getErrorCounts.ts similarity index 53% rename from frontend/src/types/api/errors/getById.ts rename to frontend/src/types/api/errors/getErrorCounts.ts index c812410b89..ab690bd0c6 100644 --- a/frontend/src/types/api/errors/getById.ts +++ b/frontend/src/types/api/errors/getErrorCounts.ts @@ -1,11 +1,8 @@ import { GlobalTime } from 'types/actions/globalTime'; -import { PayloadProps as Payload } from './getByErrorTypeAndService'; - -export type PayloadProps = Payload; - export type Props = { start: GlobalTime['minTime']; end: GlobalTime['minTime']; - errorId: string; }; + +export type PayloadProps = number; diff --git a/frontend/src/types/api/errors/getNextPrevId.ts b/frontend/src/types/api/errors/getNextPrevId.ts new file mode 100644 index 0000000000..99a336fdd8 --- /dev/null +++ b/frontend/src/types/api/errors/getNextPrevId.ts @@ -0,0 +1,13 @@ +export type Props = { + errorID: string; + timestamp: string; + groupID: string; +}; + +export type PayloadProps = { + prevErrorID: string; + nextErrorID: string; + groupID: string; + nextTimestamp: string; + prevTimestamp: string; +}; diff --git a/frontend/src/types/api/trace/getTraceItem.ts b/frontend/src/types/api/trace/getTraceItem.ts index a653823821..4b12d15b2f 100644 --- a/frontend/src/types/api/trace/getTraceItem.ts +++ b/frontend/src/types/api/trace/getTraceItem.ts @@ -18,10 +18,10 @@ export type Span = [ string, string, string, - string | string[], - string | string[], - string | string[], - Record[], + string[], + string[], + string[], + string[], boolean, ]; diff --git a/frontend/src/types/reducer/trace.ts b/frontend/src/types/reducer/trace.ts index 
fc1c08f4fc..fed82dd0be 100644 --- a/frontend/src/types/reducer/trace.ts +++ b/frontend/src/types/reducer/trace.ts @@ -69,7 +69,9 @@ export type TraceFilterEnum = | 'httpUrl' | 'operation' | 'serviceName' - | 'status'; + | 'status' + | 'responseStatusCode' + | 'rpcMethod'; export const AllPanelHeading: { key: TraceFilterEnum; @@ -107,6 +109,14 @@ export const AllPanelHeading: { key: 'operation', displayValue: 'Operation', }, + { + key: 'responseStatusCode', + displayValue: 'Status Code', + }, + { + key: 'rpcMethod', + displayValue: 'RPC Method', + }, { key: 'serviceName', displayValue: 'Service Name', diff --git a/frontend/src/utils/__tests__/__snapshots__/spanToTree.test.ts.snap b/frontend/src/utils/__tests__/__snapshots__/spanToTree.test.ts.snap new file mode 100644 index 0000000000..2c2ab402e2 --- /dev/null +++ b/frontend/src/utils/__tests__/__snapshots__/spanToTree.test.ts.snap @@ -0,0 +1,211 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`utils/spanToTree should return a single tree on valid trace data 1`] = ` +Object { + "missingSpanTree": Array [], + "spanTree": Array [ + Object { + "children": Array [ + Object { + "children": Array [ + Object { + "children": Array [], + "event": Array [ + Object { + "attributeMap": Object { + "event": "HTTP request received S3", + "level": "info", + "method": "GET", + "url": "/dispatch?customer=392&nonse=0.015296363321630757", + }, + "timeUnixNano": 1657275433246142000, + }, + ], + "hasError": false, + "id": "span_3", + "isProcessed": true, + "name": "HTTP GET SPAN 3", + "references": Array [ + Object { + "RefType": "CHILD_OF", + "SpanId": "span_2", + "TraceId": "0000000000000000span_1", + }, + ], + "serviceColour": "", + "serviceName": "frontend", + "startTime": 1657275433246, + "tags": Array [ + Object { + "key": "host.name.span3", + "value": "span_3", + }, + ], + "time": 683273000, + "value": 683273000, + }, + ], + "event": Array [ + Object { + "attributeMap": Object { + "event": "HTTP request received S2", + "level": "info", + "method": "GET", + "url": "/dispatch?customer=392&nonse=0.015296363321630757", + }, + "timeUnixNano": 1657275433246142000, + }, + ], + "hasError": false, + "id": "span_2", + "isProcessed": true, + "name": "HTTP GET SPAN 2", + "references": Array [ + Object { + "RefType": "CHILD_OF", + "SpanId": "span_1", + "TraceId": "0000000000000000span_1", + }, + ], + "serviceColour": "", + "serviceName": "frontend", + "startTime": 1657275433246, + "tags": Array [ + Object { + "key": "host.name.span2", + "value": "span_2", + }, + ], + "time": 683273000, + "value": 683273000, + }, + ], + "event": Array [ + Object { + "attributeMap": Object { + "event": "HTTP request received S1", + "level": "info", + "method": "GET", + "url": "/dispatch?customer=392&nonse=0.015296363321630757", + }, + "timeUnixNano": 1657275433246142000, + }, + ], + "hasError": false, + "id": "span_1", + "name": "HTTP GET SPAN 1", + "references": Array [ + Object { + "RefType": "CHILD_OF", + "SpanId": "", + "TraceId": "0000000000000000span_1", + }, + ], + "serviceColour": "", + "serviceName": "frontend", + "startTime": 1657275433246, + "tags": Array [ + Object { + "key": "host.name.span1", + "value": "span_1", + }, + ], + "time": 683273000, + "value": 683273000, + }, + ], +} +`; + +exports[`utils/spanToTree should return a single tree on valid trace data 2`] = ` +Object { + "missingSpanTree": Array [ + Object { + "children": Array [ + Object { + "children": Array [], + "event": Array [ + Object { + "attributeMap": Object { + "event": "HTTP request received 
S3", + "level": "info", + "method": "GET", + "url": "/dispatch?customer=392&nonse=0.015296363321630757", + }, + "timeUnixNano": 1657275433246142000, + }, + ], + "hasError": false, + "id": "span_3", + "isProcessed": true, + "name": "HTTP GET SPAN 3", + "references": Array [ + Object { + "RefType": "CHILD_OF", + "SpanId": "span_2", + "TraceId": "0000000000000000span_1", + }, + ], + "serviceColour": "", + "serviceName": "frontend", + "startTime": 1657275433246, + "tags": Array [ + Object { + "key": "host.name.span3", + "value": "span_3", + }, + ], + "time": 683273000, + "value": 683273000, + }, + ], + "id": "span_2", + "isMissing": true, + "name": "Missing Span (span_2)", + "serviceColour": "", + "serviceName": "", + "startTime": null, + "tags": Array [], + "time": null, + "value": null, + }, + ], + "spanTree": Array [ + Object { + "children": Array [], + "event": Array [ + Object { + "attributeMap": Object { + "event": "HTTP request received S1", + "level": "info", + "method": "GET", + "url": "/dispatch?customer=392&nonse=0.015296363321630757", + }, + "timeUnixNano": 1657275433246142000, + }, + ], + "hasError": false, + "id": "span_1", + "name": "HTTP GET SPAN 1", + "references": Array [ + Object { + "RefType": "CHILD_OF", + "SpanId": "", + "TraceId": "0000000000000000span_1", + }, + ], + "serviceColour": "", + "serviceName": "frontend", + "startTime": 1657275433246, + "tags": Array [ + Object { + "key": "host.name.span1", + "value": "span_1", + }, + ], + "time": 683273000, + "value": 683273000, + }, + ], +} +`; diff --git a/frontend/src/utils/__tests__/spanToTree.test.ts b/frontend/src/utils/__tests__/spanToTree.test.ts new file mode 100644 index 0000000000..4cf7a20fb4 --- /dev/null +++ b/frontend/src/utils/__tests__/spanToTree.test.ts @@ -0,0 +1,21 @@ +import { TraceData } from '../fixtures/TraceData'; +import { spanToTreeUtil } from '../spanToTree'; + +describe('utils/spanToTree', () => { + test('should return a single tree on valid trace data', () => { + const spanTree = spanToTreeUtil(TraceData); + expect(spanTree.spanTree.length).toBe(1); + expect(spanTree.missingSpanTree.length).toBe(0); + expect(spanTree).toMatchSnapshot(); + }); + test('should return a single tree on valid trace data', () => { + const MissingTraceData = [...TraceData]; + MissingTraceData.splice(1, 1); + + const spanTree = spanToTreeUtil(MissingTraceData); + + expect(spanTree.spanTree.length).toBe(1); + expect(spanTree.missingSpanTree.length).toBe(1); + expect(spanTree).toMatchSnapshot(); + }); +}); diff --git a/frontend/src/utils/fixtures/TraceData.ts b/frontend/src/utils/fixtures/TraceData.ts new file mode 100644 index 0000000000..289e91e949 --- /dev/null +++ b/frontend/src/utils/fixtures/TraceData.ts @@ -0,0 +1,52 @@ +import { Span } from 'types/api/trace/getTraceItem'; + +export const TraceData: Span[] = [ + [ + 1657275433246, + 'span_1', + '0000000000000000span_1', + 'frontend', + 'HTTP GET SPAN 1', + '2', + '683273000', + ['host.name.span1'], + ['span_1'], + ['{TraceId=0000000000000000span_1, SpanId=, RefType=CHILD_OF}'], + [ + '{"timeUnixNano":1657275433246142000,"attributeMap":{"event":"HTTP request received S1","level":"info","method":"GET","url":"/dispatch?customer=392\\u0026nonse=0.015296363321630757"}}', + ], + false, + ], + [ + 1657275433246, + 'span_2', + '0000000000000000span_1', + 'frontend', + 'HTTP GET SPAN 2', + '2', + '683273000', + ['host.name.span2'], + ['span_2'], + ['{TraceId=0000000000000000span_1, SpanId=span_1, RefType=CHILD_OF}'], + [ + 
'{"timeUnixNano":1657275433246142000,"attributeMap":{"event":"HTTP request received S2","level":"info","method":"GET","url":"/dispatch?customer=392\\u0026nonse=0.015296363321630757"}}', + ], + false, + ], + [ + 1657275433246, + 'span_3', + '0000000000000000span_1', + 'frontend', + 'HTTP GET SPAN 3', + '2', + '683273000', + ['host.name.span3'], + ['span_3'], + ['{TraceId=0000000000000000span_1, SpanId=span_2, RefType=CHILD_OF}'], + [ + '{"timeUnixNano":1657275433246142000,"attributeMap":{"event":"HTTP request received S3","level":"info","method":"GET","url":"/dispatch?customer=392\\u0026nonse=0.015296363321630757"}}', + ], + false, + ], +]; diff --git a/frontend/src/utils/spanToTree.ts b/frontend/src/utils/spanToTree.ts index 115c4c111a..142df3dec8 100644 --- a/frontend/src/utils/spanToTree.ts +++ b/frontend/src/utils/spanToTree.ts @@ -109,6 +109,12 @@ export const spanToTreeUtil = (inputSpanList: Span[]): ITraceForest => { const missingSpanTree: ITraceTree[] = []; const referencedTraceIds: string[] = Array.from(traceIdSet); Object.keys(spanMap).forEach((spanId) => { + const isRoot = spanMap[spanId].references?.some((refs) => refs.SpanId === ''); + if (isRoot) { + spanTree.push(spanMap[spanId]); + return; + } + for (const traceId of referencedTraceIds) { if (traceId.includes(spanId)) { spanTree.push(spanMap[spanId]); diff --git a/frontend/tests/auth.json b/frontend/tests/auth.json new file mode 100644 index 0000000000..2dd3d40466 --- /dev/null +++ b/frontend/tests/auth.json @@ -0,0 +1,38 @@ +{ + "cookies": [], + "origins": [ + { + "origin": "http://localhost:3301", + "localStorage": [ + { + "name": "isSideBarCollapsed", + "value": "false" + }, + { + "name": "metricsTimeDurations", + "value": "{}" + }, + { + "name": "i18nextLng", + "value": "en-US" + }, + { + "name": "reactQueryDevtoolsSortFn", + "value": "\"Status > Last Updated\"" + }, + { + "name": "AUTH_TOKEN", + "value": "authtoken" + }, + { + "name": "IS_LOGGED_IN", + "value": "true" + }, + { + "name": "REFRESH_AUTH_TOKEN", + "value": "refreshJwt" + } + ] + } + ] +} \ No newline at end of file diff --git a/frontend/tests/fixtures/api/login/200.json b/frontend/tests/fixtures/api/login/200.json new file mode 100644 index 0000000000..2ea22f87d8 --- /dev/null +++ b/frontend/tests/fixtures/api/login/200.json @@ -0,0 +1,7 @@ +{ + "accessJwt": "authtoken", + "accessJwtExpiry": 1656609177, + "refreshJwt": "refreshJwt", + "refreshJwtExpiry": 1659199377, + "userId": "34917776-514b-4b95-a4f5-1a5cc06e34b6" +} diff --git a/frontend/tests/fixtures/api/organisation/201.json b/frontend/tests/fixtures/api/organisation/201.json new file mode 100644 index 0000000000..deea4b3512 --- /dev/null +++ b/frontend/tests/fixtures/api/organisation/201.json @@ -0,0 +1,3 @@ +{ + "data": "org updated successfully" +} diff --git a/frontend/tests/fixtures/api/register/200.json b/frontend/tests/fixtures/api/register/200.json new file mode 100644 index 0000000000..6088583942 --- /dev/null +++ b/frontend/tests/fixtures/api/register/200.json @@ -0,0 +1 @@ +{ "data": "user registered successfully" } diff --git a/frontend/tests/fixtures/api/register/401.json b/frontend/tests/fixtures/api/register/401.json new file mode 100644 index 0000000000..6fd241b44c --- /dev/null +++ b/frontend/tests/fixtures/api/register/401.json @@ -0,0 +1,5 @@ +{ + "status": "error", + "errorType": "unauthorized", + "error": "You are not allowed to create an account. 
Please ask your admin to send an invite link" +} diff --git a/frontend/tests/fixtures/api/userId/200.json b/frontend/tests/fixtures/api/userId/200.json new file mode 100644 index 0000000000..527c60eab6 --- /dev/null +++ b/frontend/tests/fixtures/api/userId/200.json @@ -0,0 +1,11 @@ +{ + "createdAt": 1651759141, + "email": "prashant@signoz.io", + "groupId": "36261238-3214-4ae9-9ef1-661a9f7be5d0", + "id": "509fab4a-2578-4f24-8245-1b77b2d6d937", + "name": "Prashant", + "orgId": "72b4024a-3301-4d90-951e-ee071b96dba5", + "organization": "Meta", + "profilePictureURL": "", + "role": "ADMIN" +} diff --git a/frontend/tests/fixtures/common.ts b/frontend/tests/fixtures/common.ts new file mode 100644 index 0000000000..d691cae423 --- /dev/null +++ b/frontend/tests/fixtures/common.ts @@ -0,0 +1,43 @@ +import { Page } from '@playwright/test'; +import { getVersion } from 'constants/api'; + +import loginApiResponse from './api/login/200.json'; +import updateOrgResponse from './api/organisation/201.json'; +import successLoginResponse from './api/register/200.json'; +import userLoginResponse from './api/userId/200.json'; +import { version } from './constant'; + +export const waitForVersionApiSuccess = async (page: Page): Promise => { + await page.route(`**/${getVersion}`, (route) => + route.fulfill({ + status: 200, + body: JSON.stringify({ version }), + }), + ); +}; + +export const loginApi = async (page: Page): Promise => { + await Promise.all([ + page.route(`**/register`, (route) => + route.fulfill({ + status: 200, + body: JSON.stringify(successLoginResponse), + }), + ), + page.route(`**/user/${loginApiResponse.userId}`, (route) => + route.fulfill({ status: 200, body: JSON.stringify(userLoginResponse) }), + ), + page.route('**/login', (route) => + route.fulfill({ + status: 200, + body: JSON.stringify(loginApiResponse), + }), + ), + page.route(`**/org/${userLoginResponse.orgId}`, (route) => + route.fulfill({ + status: 200, + body: JSON.stringify(updateOrgResponse), + }), + ), + ]); +}; diff --git a/frontend/tests/fixtures/constant.ts b/frontend/tests/fixtures/constant.ts new file mode 100644 index 0000000000..ac20029c4a --- /dev/null +++ b/frontend/tests/fixtures/constant.ts @@ -0,0 +1,8 @@ +export const version = 'v1.0.0'; +export const validemail = 'sample@signoz.io'; +export const validName = 'Palash'; +export const validCompanyName = 'Signoz'; +export const validPassword = 'SamplePassword98@@'; + +export const getStartedButtonSelector = 'button[data-attr="signup"]'; +export const confirmPasswordSelector = '#password-confirm-error'; diff --git a/frontend/tests/login/fail.spec.ts b/frontend/tests/login/fail.spec.ts new file mode 100644 index 0000000000..5366d7240c --- /dev/null +++ b/frontend/tests/login/fail.spec.ts @@ -0,0 +1,28 @@ +import { expect, test } from '@playwright/test'; +import { getVersion } from 'constants/api'; +import ROUTES from 'constants/routes'; + +test.describe('Version API fail while loading login page', async () => { + test('Something went wrong', async ({ page, baseURL }) => { + const loginPage = `${baseURL}${ROUTES.LOGIN}`; + + const text = 'Something went wrong'; + + await page.route(`**/${getVersion}`, (route) => + route.fulfill({ + status: 500, + body: JSON.stringify({ error: text }), + }), + ); + + await page.goto(loginPage, { + waitUntil: 'networkidle', + }); + + const el = page.locator(`text=${text}`); + + expect(el).toBeVisible(); + expect(el).toHaveText(`${text}`); + expect(await el.getAttribute('disabled')).toBe(null); + }); +}); diff --git 
a/frontend/tests/login/index.spec.ts b/frontend/tests/login/index.spec.ts new file mode 100644 index 0000000000..ec735460ab --- /dev/null +++ b/frontend/tests/login/index.spec.ts @@ -0,0 +1,49 @@ +import { expect, test } from '@playwright/test'; +import ROUTES from 'constants/routes'; + +import { waitForVersionApiSuccess } from '../fixtures/common'; +import { version } from '../fixtures/constant'; + +test.describe('Login Page', () => { + test.beforeEach(async ({ baseURL, page }) => { + const loginPage = `${baseURL}${ROUTES.LOGIN}`; + + await waitForVersionApiSuccess(page); + + await Promise.all([page.goto(loginPage), page.waitForRequest('**/version')]); + }); + + test('Login Page text should be visible', async ({ page }) => { + const signup = 'Monitor your applications. Find what is causing issues.'; + + // Click text=Monitor your applications. Find what is causing issues. + const el = page.locator(`text=${signup}`); + + expect(el).toBeVisible(); + }); + + test('Create an account button should be present', async ({ + page, + baseURL, + }) => { + const loginPage = `${baseURL}${ROUTES.LOGIN}`; + + // find button which has text=Create an account + const button = page.locator('text=Create an account'); + + expect(button).toBeVisible(); + expect(button).toHaveText('Create an account'); + expect(await button.getAttribute('disabled')).toBe(null); + + expect(await button.isEnabled()).toBe(true); + await expect(page).toHaveURL(loginPage); + }); + + test('Version of the application when api returns 200', async ({ page }) => { + // Click text=SigNoz ${version} + const element = page.locator(`text=SigNoz ${version}`); + element.isVisible(); + const text = await element.innerText(); + expect(text).toBe(`SigNoz ${version}`); + }); +}); diff --git a/frontend/tests/service/index.spec.ts b/frontend/tests/service/index.spec.ts new file mode 100644 index 0000000000..ae708322ed --- /dev/null +++ b/frontend/tests/service/index.spec.ts @@ -0,0 +1,22 @@ +import { expect, Page, test } from '@playwright/test'; +import ROUTES from 'constants/routes'; + +import { loginApi } from '../fixtures/common'; + +let page: Page; + +test.describe('Service Page', () => { + test.beforeEach(async ({ baseURL, browser }) => { + const context = await browser.newContext({ storageState: 'tests/auth.json' }); + const newPage = await context.newPage(); + + await loginApi(newPage); + + await newPage.goto(`${baseURL}${ROUTES.APPLICATION}`); + + page = newPage; + }); + test('Service Page is rendered', async ({ baseURL }) => { + await expect(page).toHaveURL(`${baseURL}${ROUTES.APPLICATION}`); + }); +}); diff --git a/frontend/tests/signup/index.spec.ts b/frontend/tests/signup/index.spec.ts index a7e06f4fa6..afdc98f140 100644 --- a/frontend/tests/signup/index.spec.ts +++ b/frontend/tests/signup/index.spec.ts @@ -1,17 +1,224 @@ -import { expect, test } from '@playwright/test'; +import { expect, Page, PlaywrightTestOptions, test } from '@playwright/test'; import ROUTES from 'constants/routes'; -test('Login Page', async ({ page, baseURL }) => { - const loginPage = `${baseURL}${ROUTES.LOGIN}`; +import { loginApi, waitForVersionApiSuccess } from '../fixtures/common'; +import { + confirmPasswordSelector, + getStartedButtonSelector, + validCompanyName, + validemail, + validName, + validPassword, +} from '../fixtures/constant'; - await page.goto(loginPage, { - waitUntil: 'networkidle', +const waitForSignUpPageSuccess = async ( + baseURL: PlaywrightTestOptions['baseURL'], + page: Page, +): Promise<void> => { + const signupPage = 
`${baseURL}${ROUTES.SIGN_UP}`; + + await page.goto(signupPage); + + await waitForVersionApiSuccess(page); +}; + +interface FillDetailsInSignUpFormProps { + page: Page; + email: string; + name: string; + companyName: string; + password: string; + confirmPassword: string; +} + +const fillDetailsInSignUpForm = async ({ + page, + email, + name, + companyName, + password, + confirmPassword, +}: FillDetailsInSignUpFormProps): Promise<void> => { + const emailplaceholder = '[placeholder="name\\@yourcompany\\.com"]'; + const nameplaceholder = '[placeholder="Your Name"]'; + const companyPlaceholder = '[placeholder="Your Company"]'; + const currentPasswordId = '#currentPassword'; + const confirmPasswordId = '#confirmPassword'; + + // Fill [placeholder="name\@yourcompany\.com"] + await page.locator(emailplaceholder).fill(email); + + // Fill [placeholder="Your Name"] + await page.locator(nameplaceholder).fill(name); + + // Fill [placeholder="Your Company"] + await page.locator(companyPlaceholder).fill(companyName); + + // Fill #currentPassword + await page.locator(currentPasswordId).fill(password); + + // Fill #confirmPasswordId + await page.locator(confirmPasswordId).fill(confirmPassword); +}; + +test.describe('Sign Up Page', () => { + test('When the user successfully signs up and logs in, they should be redirected to the dashboard', async ({ + page, + baseURL, + }) => { + const loginPage = `${baseURL}${ROUTES.LOGIN}`; + + await waitForVersionApiSuccess(page); + + await Promise.all([page.goto(loginPage), page.waitForRequest('**/version')]); + + const buttonSignupButton = page.locator('text=Create an account'); + + await buttonSignupButton.click(); + + expect(page).toHaveURL(`${baseURL}${ROUTES.SIGN_UP}`); }); - const signup = 'Monitor your applications. Find what is causing issues.'; + test('Invite link validation', async ({ baseURL, page }) => { + await waitForSignUpPageSuccess(baseURL, page); + const message = + 'This will create an admin account. If you are not an admin, please ask your admin for an invite link'; - // Click text=Monitor your applications. Find what is causing issues. 
- const el = page.locator(`text=${signup}`); + const messageText = await page.locator(`text=${message}`).innerText(); - expect(el).toBeVisible(); + expect(messageText).toBe(message); + }); + + test('User Sign up with valid details', async ({ baseURL, page, context }) => { + await waitForSignUpPageSuccess(baseURL, page); + + const gettingStartedButton = page.locator(getStartedButtonSelector); + + expect(await gettingStartedButton.isDisabled()).toBe(true); + + await fillDetailsInSignUpForm({ + companyName: validCompanyName, + confirmPassword: validPassword, + email: validemail, + name: validName, + page, + password: validPassword, + }); + + // password validation message is not present + const locator = await page.locator(confirmPasswordSelector).isVisible(); + expect(locator).toBe(false); + + const buttonText = await gettingStartedButton.evaluate((e) => e.innerHTML); + + expect(buttonText).toMatch(/Get Started/i); + + // Getting Started button is not disabled + expect(await gettingStartedButton.isDisabled()).toBe(false); + + await loginApi(page); + + await gettingStartedButton.click(); + + await expect(page).toHaveURL(`${baseURL}${ROUTES.APPLICATION}`); + + await context.storageState({ + path: 'tests/auth.json', + }); + }); + + test('Empty name with valid details', async ({ baseURL, page }) => { + await waitForSignUpPageSuccess(baseURL, page); + + await fillDetailsInSignUpForm({ + companyName: validCompanyName, + confirmPassword: validPassword, + email: validemail, + name: '', + page, + password: validPassword, + }); + + const gettingStartedButton = page.locator(getStartedButtonSelector); + + expect(await gettingStartedButton.isDisabled()).toBe(true); + }); + + test('Empty Company name with valid details', async ({ baseURL, page }) => { + await waitForSignUpPageSuccess(baseURL, page); + + await fillDetailsInSignUpForm({ + companyName: '', + confirmPassword: validPassword, + email: validemail, + name: validName, + page, + password: validPassword, + }); + + const gettingStartedButton = page.locator(getStartedButtonSelector); + + expect(await gettingStartedButton.isDisabled()).toBe(true); + }); + + test('Empty Email with valid details', async ({ baseURL, page }) => { + await waitForSignUpPageSuccess(baseURL, page); + + await fillDetailsInSignUpForm({ + companyName: validCompanyName, + confirmPassword: validPassword, + email: '', + name: validName, + page, + password: validPassword, + }); + + const gettingStartedButton = page.locator(getStartedButtonSelector); + + expect(await gettingStartedButton.isDisabled()).toBe(true); + }); + + test('Empty Password and confirm password with valid details', async ({ + baseURL, + page, + }) => { + await waitForSignUpPageSuccess(baseURL, page); + + await fillDetailsInSignUpForm({ + companyName: validCompanyName, + confirmPassword: '', + email: validemail, + name: validName, + page, + password: '', + }); + + const gettingStartedButton = page.locator(getStartedButtonSelector); + + expect(await gettingStartedButton.isDisabled()).toBe(true); + + // password validation message is not present + const locator = await page.locator(confirmPasswordSelector).isVisible(); + expect(locator).toBe(false); + }); + + test('Mismatched Password and confirm password with valid details', async ({ + baseURL, + page, + }) => { + await waitForSignUpPageSuccess(baseURL, page); + + await fillDetailsInSignUpForm({ + companyName: validCompanyName, + confirmPassword: validPassword, + email: validemail, + name: validName, + page, + password: '', + }); + + // password validation message is
not present + const locator = await page.locator(confirmPasswordSelector).isVisible(); + expect(locator).toBe(true); + }); }); diff --git a/frontend/tsconfig.json b/frontend/tsconfig.json index ca86de66b0..92ea1e3649 100644 --- a/frontend/tsconfig.json +++ b/frontend/tsconfig.json @@ -36,6 +36,7 @@ "./commitlint.config.ts", "./webpack.config.js", "./webpack.config.prod.js", - "./jest.setup.ts" + "./jest.setup.ts", + "./tests/**.ts", ] } diff --git a/frontend/yarn.lock b/frontend/yarn.lock index 3a254370a1..f2d9ad04ad 100644 --- a/frontend/yarn.lock +++ b/frontend/yarn.lock @@ -4093,6 +4093,11 @@ chartjs-adapter-date-fns@^2.0.0: resolved "https://registry.yarnpkg.com/chartjs-adapter-date-fns/-/chartjs-adapter-date-fns-2.0.0.tgz#5e53b2f660b993698f936f509c86dddf9ed44c6b" integrity sha512-rmZINGLe+9IiiEB0kb57vH3UugAtYw33anRiw5kS2Tu87agpetDDoouquycWc9pRsKtQo5j+vLsYHyr8etAvFw== +chartjs-plugin-annotation@^1.4.0: + version "1.4.0" + resolved "https://registry.yarnpkg.com/chartjs-plugin-annotation/-/chartjs-plugin-annotation-1.4.0.tgz#4c84cec1ec838bc09712f3686237866e6c3f4798" + integrity sha512-OC0eGoVvdxTtGGi8mV3Dr+G1YmMhtYYQWqGMb2uWcgcnyiBslaRKPofKwAYWPbh7ABnmQNsNDQLIKPH+XiaZLA== + "chokidar@>=3.0.0 <4.0.0", chokidar@^3.5.3: version "3.5.3" resolved "https://registry.yarnpkg.com/chokidar/-/chokidar-3.5.3.tgz#1cf37c8707b932bd1af1ae22c0432e2acd1903bd" @@ -12408,6 +12413,11 @@ timed-out@^4.0.1: resolved "https://registry.yarnpkg.com/timed-out/-/timed-out-4.0.1.tgz#f32eacac5a175bea25d7fab565ab3ed8741ef56f" integrity sha1-8y6srFoXW+ol1/q1Zas+2HQe9W8= +timestamp-nano@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/timestamp-nano/-/timestamp-nano-1.0.0.tgz#03bf0b43c2bdcb913a6a02fbaae6f97d68650f3a" + integrity sha512-NO/1CZigzlCWQiWdIGv8ebXt6Uk77zdLz2NE7KcZRU5Egj2+947lzUpk30xQUQlq5dRY25j7ZulG4RfA2DHYfA== + tiny-invariant@^1.0.2: version "1.2.0" resolved "https://registry.yarnpkg.com/tiny-invariant/-/tiny-invariant-1.2.0.tgz#a1141f86b672a9148c72e978a19a73b9b94a15a9" diff --git a/pkg/query-service/app/clickhouseReader/options.go b/pkg/query-service/app/clickhouseReader/options.go index fc25fb43ce..930a5e9d70 100644 --- a/pkg/query-service/app/clickhouseReader/options.go +++ b/pkg/query-service/app/clickhouseReader/options.go @@ -22,7 +22,7 @@ const ( defaultTraceDB string = "signoz_traces" defaultOperationsTable string = "signoz_operations" defaultIndexTable string = "signoz_index_v2" - defaultErrorTable string = "signoz_error_index" + defaultErrorTable string = "signoz_error_index_v2" defaulDurationTable string = "durationSortMV" defaultSpansTable string = "signoz_spans" defaultLogsDB string = "signoz_logs" diff --git a/pkg/query-service/app/clickhouseReader/reader.go b/pkg/query-service/app/clickhouseReader/reader.go index 687fcb3e46..4060b08eed 100644 --- a/pkg/query-service/app/clickhouseReader/reader.go +++ b/pkg/query-service/app/clickhouseReader/reader.go @@ -3,16 +3,12 @@ package clickhouseReader import ( "bytes" "context" - "crypto/md5" - "database/sql" "encoding/json" - "flag" + "fmt" "io/ioutil" "math/rand" - "net" "net/http" - "net/url" "os" "reflect" "regexp" @@ -27,20 +23,16 @@ import ( "github.com/google/uuid" "github.com/oklog/oklog/pkg/group" "github.com/pkg/errors" - "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/promlog" "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/discovery" sd_config "github.com/prometheus/prometheus/discovery/config" - "github.com/prometheus/prometheus/notifier" 
"github.com/prometheus/prometheus/promql" - "github.com/prometheus/prometheus/rules" + "github.com/prometheus/prometheus/scrape" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/storage/remote" "github.com/prometheus/prometheus/util/stats" - "github.com/prometheus/prometheus/util/strutil" - "github.com/prometheus/tsdb" "github.com/ClickHouse/clickhouse-go/v2" "github.com/ClickHouse/clickhouse-go/v2/lib/driver" @@ -61,7 +53,7 @@ const ( signozTraceDBName = "signoz_traces" signozDurationMVTable = "durationSort" signozSpansTable = "signoz_spans" - signozErrorIndexTable = "signoz_error_index" + signozErrorIndexTable = "signoz_error_index_v2" signozTraceTableName = "signoz_index_v2" signozMetricDBName = "signoz_metrics" signozSampleTableName = "samples_v2" @@ -98,24 +90,30 @@ type ClickHouseReader struct { logsResourceKeys string queryEngine *promql.Engine remoteStorage *remote.Storage - ruleManager *rules.Manager - promConfig *config.Config - alertManager am.Manager + + promConfigFile string + promConfig *config.Config + alertManager am.Manager } // NewTraceReader returns a TraceReader for the database -func NewReader(localDB *sqlx.DB) *ClickHouseReader { +func NewReader(localDB *sqlx.DB, configFile string) *ClickHouseReader { datasource := os.Getenv("ClickHouseUrl") options := NewOptions(datasource, primaryNamespace, archiveNamespace) db, err := initialize(options) if err != nil { - zap.S().Error(err) + zap.S().Error("failed to initialize ClickHouse: ", err) os.Exit(1) } - alertManager := am.New("") + alertManager, err := am.New("") + if err != nil { + zap.S().Errorf("msg: failed to initialize alert manager: ", "/t error:", err) + zap.S().Errorf("msg: check if the alert manager URL is correctly set and valid") + os.Exit(1) + } return &ClickHouseReader{ db: db, @@ -131,6 +129,7 @@ func NewReader(localDB *sqlx.DB) *ClickHouseReader { logsTable: options.primary.LogsTable, logsAttributeKeys: options.primary.LogsAttributeKeysTable, logsResourceKeys: options.primary.LogsResourceKeysTable, + promConfigFile: configFile, } } @@ -149,30 +148,14 @@ func (r *ClickHouseReader) Start() { startTime := func() (int64, error) { return int64(promModel.Latest), nil - } remoteStorage := remote.NewStorage(log.With(logger, "component", "remote"), startTime, time.Duration(1*time.Minute)) - // conf, err := config.LoadFile(*filename) - // if err != nil { - // zap.S().Error("couldn't load configuration (--config.file=%q): %v", filename, err) - // } - - // err = remoteStorage.ApplyConfig(conf) - // if err != nil { - // zap.S().Error("Error in remoteStorage.ApplyConfig: ", err) - // } cfg := struct { configFile string localStoragePath string - notifier notifier.Options - notifierTimeout promModel.Duration - forGracePeriod promModel.Duration - outageTolerance promModel.Duration - resendDelay promModel.Duration - tsdb tsdb.Options lookbackDelta promModel.Duration webTimeout promModel.Duration queryTimeout promModel.Duration @@ -184,39 +167,15 @@ func (r *ClickHouseReader) Start() { logLevel promlog.AllowedLevel }{ - notifier: notifier.Options{ - Registerer: prometheus.DefaultRegisterer, - }, + configFile: r.promConfigFile, } - flag.StringVar(&cfg.configFile, "config", "./config/prometheus.yml", "(prometheus config to read metrics)") - flag.Parse() - // fanoutStorage := remoteStorage fanoutStorage := storage.NewFanout(logger, remoteStorage) - localStorage := remoteStorage - - cfg.notifier.QueueCapacity = 10000 - cfg.notifierTimeout = promModel.Duration(time.Duration.Seconds(10)) - notifier := 
notifier.NewManager(&cfg.notifier, log.With(logger, "component", "notifier")) - // notifier.ApplyConfig(conf) - - ExternalURL, err := computeExternalURL("", "0.0.0.0:3301") - if err != nil { - fmt.Fprintln(os.Stderr, errors.Wrapf(err, "parse external URL %q", ExternalURL.String())) - os.Exit(2) - } - - cfg.outageTolerance = promModel.Duration(time.Duration.Hours(1)) - cfg.forGracePeriod = promModel.Duration(time.Duration.Minutes(10)) - cfg.resendDelay = promModel.Duration(time.Duration.Minutes(1)) ctxScrape, cancelScrape := context.WithCancel(context.Background()) discoveryManagerScrape := discovery.NewManager(ctxScrape, log.With(logger, "component", "discovery manager scrape"), discovery.Name("scrape")) - ctxNotify, cancelNotify := context.WithCancel(context.Background()) - discoveryManagerNotify := discovery.NewManager(ctxNotify, log.With(logger, "component", "discovery manager notify"), discovery.Name("notify")) - scrapeManager := scrape.NewManager(log.With(logger, "component", "scrape manager"), fanoutStorage) opts := promql.EngineOpts{ @@ -229,25 +188,10 @@ func (r *ClickHouseReader) Start() { queryEngine := promql.NewEngine(opts) - ruleManager := rules.NewManager(&rules.ManagerOptions{ - Appendable: fanoutStorage, - TSDB: localStorage, - QueryFunc: rules.EngineQueryFunc(queryEngine, fanoutStorage), - NotifyFunc: sendAlerts(notifier, ExternalURL.String()), - Context: context.Background(), - ExternalURL: ExternalURL, - Registerer: prometheus.DefaultRegisterer, - Logger: log.With(logger, "component", "rule manager"), - OutageTolerance: time.Duration(cfg.outageTolerance), - ForGracePeriod: time.Duration(cfg.forGracePeriod), - ResendDelay: time.Duration(cfg.resendDelay), - }) - reloaders := []func(cfg *config.Config) error{ remoteStorage.ApplyConfig, - // The Scrape and notifier managers need to reload before the Discovery manager as + // The Scrape managers need to reload before the Discovery manager as // they need to read the most updated config when receiving the new targets list. - notifier.ApplyConfig, scrapeManager.ApplyConfig, func(cfg *config.Config) error { c := make(map[string]sd_config.ServiceDiscoveryConfig) @@ -256,32 +200,6 @@ func (r *ClickHouseReader) Start() { } return discoveryManagerScrape.ApplyConfig(c) }, - func(cfg *config.Config) error { - c := make(map[string]sd_config.ServiceDiscoveryConfig) - for _, v := range cfg.AlertingConfig.AlertmanagerConfigs { - // AlertmanagerConfigs doesn't hold an unique identifier so we use the config hash as the identifier. - b, err := json.Marshal(v) - if err != nil { - return err - } - c[fmt.Sprintf("%x", md5.Sum(b))] = v.ServiceDiscoveryConfig - } - return discoveryManagerNotify.ApplyConfig(c) - }, - // func(cfg *config.Config) error { - // // Get all rule files matching the configuration oaths. - // var files []string - // for _, pat := range cfg.RuleFiles { - // fs, err := filepath.Glob(pat) - // if err != nil { - // // The only error can be a bad pattern. - // return fmt.Errorf("error retrieving rule files for %s: %s", pat, err) - // } - // files = append(files, fs...) - // } - // return ruleManager.Update(time.Duration(cfg.GlobalConfig.EvaluationInterval), files) - // }, - } // sync.Once is used to make sure we can close the channel at different execution stages(SIGTERM or when the config is loaded). @@ -315,20 +233,6 @@ func (r *ClickHouseReader) Start() { }, ) } - { - // Notify discovery manager. 
- g.Add( - func() error { - err := discoveryManagerNotify.Run() - level.Info(logger).Log("msg", "Notify discovery manager stopped") - return err - }, - func(err error) { - level.Info(logger).Log("msg", "Stopping notify discovery manager...") - cancelNotify() - }, - ) - } { // Scrape manager. g.Add( @@ -364,6 +268,7 @@ func (r *ClickHouseReader) Start() { // reloadReady.Close() // return nil // } + var err error r.promConfig, err = reloadConfig(cfg.configFile, logger, reloaders...) if err != nil { return fmt.Errorf("error loading config from %q: %s", cfg.configFile, err) @@ -371,29 +276,19 @@ func (r *ClickHouseReader) Start() { reloadReady.Close() - rules, apiErrorObj := r.GetRulesFromDB() + // ! commented the alert manager can now + // call query service to do this + // channels, apiErrorObj := r.GetChannels() - if apiErrorObj != nil { - zap.S().Errorf("Not able to read rules from DB") - } - for _, rule := range *rules { - apiErrorObj = r.LoadRule(rule) - if apiErrorObj != nil { - zap.S().Errorf("Not able to load rule with id=%d loaded from DB", rule.Id, rule.Data) - } - } - - channels, apiErrorObj := r.GetChannels() - - if apiErrorObj != nil { - zap.S().Errorf("Not able to read channels from DB") - } - for _, channel := range *channels { - apiErrorObj = r.LoadChannel(&channel) - if apiErrorObj != nil { - zap.S().Errorf("Not able to load channel with id=%d loaded from DB", channel.Id, channel.Data) - } - } + //if apiErrorObj != nil { + // zap.S().Errorf("Not able to read channels from DB") + //} + //for _, channel := range *channels { + //apiErrorObj = r.LoadChannel(&channel) + //if apiErrorObj != nil { + // zap.S().Errorf("Not able to load channel with id=%d loaded from DB", channel.Id, channel.Data) + //} + //} <-cancel @@ -404,48 +299,8 @@ func (r *ClickHouseReader) Start() { }, ) } - { - // Rule manager. - // TODO(krasi) refactor ruleManager.Run() to be blocking to avoid using an extra blocking channel. - cancel := make(chan struct{}) - g.Add( - func() error { - <-reloadReady.C - ruleManager.Run() - <-cancel - return nil - }, - func(err error) { - ruleManager.Stop() - close(cancel) - }, - ) - } - { - // Notifier. - - // Calling notifier.Stop() before ruleManager.Stop() will cause a panic if the ruleManager isn't running, - // so keep this interrupt after the ruleManager.Stop(). - g.Add( - func() error { - // When the notifier manager receives a new targets list - // it needs to read a valid config for each job. - // It depends on the config being in sync with the discovery manager - // so we wait until the config is fully loaded. - <-reloadReady.C - - notifier.Run(discoveryManagerNotify.SyncCh()) - level.Info(logger).Log("msg", "Notifier manager stopped") - return nil - }, - func(err error) { - notifier.Stop() - }, - ) - } r.queryEngine = queryEngine r.remoteStorage = remoteStorage - r.ruleManager = ruleManager if err := g.Run(); err != nil { level.Error(logger).Log("err", err) @@ -476,70 +331,6 @@ func reloadConfig(filename string, logger log.Logger, rls ...func(*config.Config return conf, nil } -func startsOrEndsWithQuote(s string) bool { - return strings.HasPrefix(s, "\"") || strings.HasPrefix(s, "'") || - strings.HasSuffix(s, "\"") || strings.HasSuffix(s, "'") -} - -// computeExternalURL computes a sanitized external URL from a raw input. It infers unset -// URL parts from the OS and the given listen address. 
-func computeExternalURL(u, listenAddr string) (*url.URL, error) { - if u == "" { - hostname, err := os.Hostname() - if err != nil { - return nil, err - } - _, port, err := net.SplitHostPort(listenAddr) - if err != nil { - return nil, err - } - u = fmt.Sprintf("http://%s:%s/", hostname, port) - } - - if startsOrEndsWithQuote(u) { - return nil, fmt.Errorf("URL must not begin or end with quotes") - } - - eu, err := url.Parse(u) - if err != nil { - return nil, err - } - - ppref := strings.TrimRight(eu.Path, "/") - if ppref != "" && !strings.HasPrefix(ppref, "/") { - ppref = "/" + ppref - } - eu.Path = ppref - - return eu, nil -} - -// sendAlerts implements the rules.NotifyFunc for a Notifier. -func sendAlerts(n *notifier.Manager, externalURL string) rules.NotifyFunc { - return func(ctx context.Context, expr string, alerts ...*rules.Alert) { - var res []*notifier.Alert - - for _, alert := range alerts { - a := ¬ifier.Alert{ - StartsAt: alert.FiredAt, - Labels: alert.Labels, - Annotations: alert.Annotations, - GeneratorURL: externalURL + strutil.TableLinkForExpression(expr), - } - if !alert.ResolvedAt.IsZero() { - a.EndsAt = alert.ResolvedAt - } else { - a.EndsAt = alert.ValidUntil - } - res = append(res, a) - } - - if len(alerts) > 0 { - n.Send(res...) - } - } -} - func initialize(options *Options) (clickhouse.Conn, error) { db, err := connect(options.getPrimary()) @@ -558,156 +349,8 @@ func connect(cfg *namespaceConfig) (clickhouse.Conn, error) { return cfg.Connector(cfg) } -type byAlertStateAndNameSorter struct { - alerts []*AlertingRuleWithGroup -} - -func (s byAlertStateAndNameSorter) Len() int { - return len(s.alerts) -} - -func (s byAlertStateAndNameSorter) Less(i, j int) bool { - return s.alerts[i].State() > s.alerts[j].State() || - (s.alerts[i].State() == s.alerts[j].State() && - s.alerts[i].Name() < s.alerts[j].Name()) -} - -func (s byAlertStateAndNameSorter) Swap(i, j int) { - s.alerts[i], s.alerts[j] = s.alerts[j], s.alerts[i] -} - -type AlertingRuleWithGroup struct { - rules.AlertingRule - Id int -} - -func (r *ClickHouseReader) GetRulesFromDB() (*[]model.RuleResponseItem, *model.ApiError) { - - rules := []model.RuleResponseItem{} - - query := fmt.Sprintf("SELECT id, updated_at, data FROM rules") - - err := r.localDB.Select(&rules, query) - - zap.S().Info(query) - - if err != nil { - zap.S().Debug("Error in processing sql query: ", err) - return nil, &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - - return &rules, nil -} - -func (r *ClickHouseReader) GetRule(id string) (*model.RuleResponseItem, *model.ApiError) { - - idInt, err := strconv.Atoi(id) - if err != nil { - zap.S().Debug("Error in parsing param: ", err) - return nil, &model.ApiError{Typ: model.ErrorBadData, Err: err} - } - - rule := &model.RuleResponseItem{} - - query := "SELECT id, updated_at, data FROM rules WHERE id=?" 
- rows, err := r.localDB.Query(query, idInt) - - if err != nil { - zap.S().Debug("Error in processing sql query: ", err) - return nil, &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - - count := 0 - // iterate over each row - for rows.Next() { - err = rows.Scan(&rule.Id, &rule.UpdatedAt, &rule.Data) - if err != nil { - zap.S().Debug(err) - return nil, &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - count += 1 - - } - - if count == 0 { - err = fmt.Errorf("no rule with id %d found", idInt) - zap.S().Debug(err) - return nil, &model.ApiError{Typ: model.ErrorNotFound, Err: err} - } - if count > 1 { - err = fmt.Errorf("multiple rules with id %d found", idInt) - zap.S().Debug(err) - return nil, &model.ApiError{Typ: model.ErrorConflict, Err: err} - } - - return rule, nil -} - -func (r *ClickHouseReader) ListRulesFromProm() (*model.AlertDiscovery, *model.ApiError) { - - groups := r.ruleManager.RuleGroups() - - alertingRulesWithGroupObjects := []*AlertingRuleWithGroup{} - - for _, group := range groups { - groupNameParts := strings.Split(group.Name(), "-groupname") - if len(groupNameParts) < 2 { - continue - } - id, _ := strconv.Atoi(groupNameParts[0]) - for _, rule := range group.Rules() { - if alertingRule, ok := rule.(*rules.AlertingRule); ok { - alertingRulesWithGroupObject := AlertingRuleWithGroup{ - *alertingRule, - id, - } - alertingRulesWithGroupObjects = append(alertingRulesWithGroupObjects, &alertingRulesWithGroupObject) - } - } - } - - // alertingRules := r.ruleManager.AlertingRules() - - alertsSorter := byAlertStateAndNameSorter{alerts: alertingRulesWithGroupObjects} - sort.Sort(alertsSorter) - alerts := []*model.AlertingRuleResponse{} - - for _, alertingRule := range alertsSorter.alerts { - - alertingRuleResponseObject := &model.AlertingRuleResponse{ - Labels: alertingRule.Labels(), - // Annotations: alertingRule.Annotations(), - Name: alertingRule.Name(), - Id: alertingRule.Id, - } - if len(alertingRule.ActiveAlerts()) == 0 { - alertingRuleResponseObject.State = rules.StateInactive.String() - } else { - alertingRuleResponseObject.State = (*(alertingRule.ActiveAlerts()[0])).State.String() - } - - alerts = append( - alerts, - alertingRuleResponseObject, - ) - } - - res := &model.AlertDiscovery{Alerts: alerts} - - return res, nil -} - -func (r *ClickHouseReader) LoadRule(rule model.RuleResponseItem) *model.ApiError { - - groupName := fmt.Sprintf("%d-groupname", rule.Id) - - err := r.ruleManager.AddGroup(time.Duration(r.promConfig.GlobalConfig.EvaluationInterval), rule.Data, groupName) - - if err != nil { - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - - return nil +func (r *ClickHouseReader) GetConn() clickhouse.Conn { + return r.db } func (r *ClickHouseReader) LoadChannel(channel *model.ChannelItem) *model.ApiError { @@ -952,138 +595,6 @@ func (r *ClickHouseReader) CreateChannel(receiver *am.Receiver) (*am.Receiver, * } -func (r *ClickHouseReader) CreateRule(rule string) *model.ApiError { - - tx, err := r.localDB.Begin() - if err != nil { - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - - var lastInsertId int64 - - { - stmt, err := tx.Prepare(`INSERT into rules (updated_at, data) VALUES($1,$2);`) - if err != nil { - zap.S().Errorf("Error in preparing statement for INSERT to rules\n", err) - tx.Rollback() - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - defer stmt.Close() - - result, err := stmt.Exec(time.Now(), rule) - if err != nil { - zap.S().Errorf("Error in Executing prepared statement for INSERT to rules\n", 
err) - tx.Rollback() // return an error too, we may want to wrap them - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - lastInsertId, _ = result.LastInsertId() - - groupName := fmt.Sprintf("%d-groupname", lastInsertId) - - err = r.ruleManager.AddGroup(time.Duration(r.promConfig.GlobalConfig.EvaluationInterval), rule, groupName) - - if err != nil { - tx.Rollback() - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - } - err = tx.Commit() - if err != nil { - zap.S().Errorf("Error in committing transaction for INSERT to rules\n", err) - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - return nil -} - -func (r *ClickHouseReader) EditRule(rule string, id string) *model.ApiError { - - idInt, _ := strconv.Atoi(id) - - tx, err := r.localDB.Begin() - if err != nil { - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - - { - stmt, err := tx.Prepare(`UPDATE rules SET updated_at=$1, data=$2 WHERE id=$3;`) - if err != nil { - zap.S().Errorf("Error in preparing statement for UPDATE to rules\n", err) - tx.Rollback() - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - defer stmt.Close() - - if _, err := stmt.Exec(time.Now(), rule, idInt); err != nil { - zap.S().Errorf("Error in Executing prepared statement for UPDATE to rules\n", err) - tx.Rollback() // return an error too, we may want to wrap them - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - - groupName := fmt.Sprintf("%d-groupname", idInt) - - err = r.ruleManager.EditGroup(time.Duration(r.promConfig.GlobalConfig.EvaluationInterval), rule, groupName) - - if err != nil { - tx.Rollback() - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - } - - err = tx.Commit() - if err != nil { - zap.S().Errorf("Error in committing transaction for UPDATE to rules\n", err) - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - - return nil -} - -func (r *ClickHouseReader) DeleteRule(id string) *model.ApiError { - - idInt, _ := strconv.Atoi(id) - - tx, err := r.localDB.Begin() - if err != nil { - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - - { - stmt, err := tx.Prepare(`DELETE FROM rules WHERE id=$1;`) - - if err != nil { - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - defer stmt.Close() - - if _, err := stmt.Exec(idInt); err != nil { - zap.S().Errorf("Error in Executing prepared statement for DELETE to rules\n", err) - tx.Rollback() // return an error too, we may want to wrap them - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - - groupName := fmt.Sprintf("%d-groupname", idInt) - - rule := "" // dummy rule to pass to function - // err = r.ruleManager.UpdateGroupWithAction(time.Duration(r.promConfig.GlobalConfig.EvaluationInterval), rule, groupName, "delete") - err = r.ruleManager.DeleteGroup(time.Duration(r.promConfig.GlobalConfig.EvaluationInterval), rule, groupName) - - if err != nil { - tx.Rollback() - zap.S().Errorf("Error in deleting rule from rulemanager...\n", err) - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - - } - - err = tx.Commit() - if err != nil { - zap.S().Errorf("Error in committing transaction for deleting rules\n", err) - return &model.ApiError{Typ: model.ErrorInternal, Err: err} - } - - return nil -} - func (r *ClickHouseReader) GetInstantQueryMetricsResult(ctx context.Context, queryParams *model.InstantQueryMetricsParams) (*promql.Result, *stats.QueryStats, *model.ApiError) { qry, err := r.queryEngine.NewInstantQuery(r.remoteStorage, queryParams.Query, 
queryParams.Time) if err != nil { @@ -1232,8 +743,8 @@ func (r *ClickHouseReader) GetServices(ctx context.Context, queryParams *model.G serviceItems[i].Num4XX = val } serviceItems[i].CallRate = float64(serviceItems[i].NumCalls) / float64(queryParams.Period) - serviceItems[i].FourXXRate = float64(serviceItems[i].Num4XX) / float64(queryParams.Period) - serviceItems[i].ErrorRate = float64(serviceItems[i].NumErrors) / float64(queryParams.Period) + serviceItems[i].FourXXRate = float64(serviceItems[i].Num4XX) * 100 / float64(serviceItems[i].NumCalls) + serviceItems[i].ErrorRate = float64(serviceItems[i].NumErrors) * 100 / float64(serviceItems[i].NumCalls) } return &serviceItems, nil @@ -1358,6 +869,12 @@ func (r *ClickHouseReader) GetSpanFilters(ctx context.Context, queryParams *mode if len(queryParams.Operation) > 0 { args = buildFilterArrayQuery(ctx, excludeMap, queryParams.Operation, constants.OperationDB, &query, args) } + if len(queryParams.RPCMethod) > 0 { + args = buildFilterArrayQuery(ctx, excludeMap, queryParams.RPCMethod, constants.RPCMethod, &query, args) + } + if len(queryParams.ResponseStatusCode) > 0 { + args = buildFilterArrayQuery(ctx, excludeMap, queryParams.ResponseStatusCode, constants.ResponseStatusCode, &query, args) + } if len(queryParams.MinDuration) != 0 { query = query + " AND durationNano >= @durationNanoMin" @@ -1371,16 +888,18 @@ func (r *ClickHouseReader) GetSpanFilters(ctx context.Context, queryParams *mode query = getStatusFilters(query, queryParams.Status, excludeMap) traceFilterReponse := model.SpanFiltersResponse{ - Status: map[string]uint64{}, - Duration: map[string]uint64{}, - ServiceName: map[string]uint64{}, - Operation: map[string]uint64{}, - HttpCode: map[string]uint64{}, - HttpMethod: map[string]uint64{}, - HttpUrl: map[string]uint64{}, - HttpRoute: map[string]uint64{}, - HttpHost: map[string]uint64{}, - Component: map[string]uint64{}, + Status: map[string]uint64{}, + Duration: map[string]uint64{}, + ServiceName: map[string]uint64{}, + Operation: map[string]uint64{}, + ResponseStatusCode: map[string]uint64{}, + RPCMethod: map[string]uint64{}, + HttpCode: map[string]uint64{}, + HttpMethod: map[string]uint64{}, + HttpUrl: map[string]uint64{}, + HttpRoute: map[string]uint64{}, + HttpHost: map[string]uint64{}, + Component: map[string]uint64{}, } for _, e := range queryParams.GetFilters { @@ -1581,6 +1100,42 @@ func (r *ClickHouseReader) GetSpanFilters(ctx context.Context, queryParams *mode if len(dBResponse2) > 0 { traceFilterReponse.Duration["maxDuration"] = dBResponse2[0].NumTotal } + case constants.RPCMethod: + finalQuery := fmt.Sprintf("SELECT rpcMethod, count() as count FROM %s.%s WHERE timestamp >= @timestampL AND timestamp <= @timestampU", r.traceDB, r.indexTable) + finalQuery += query + finalQuery += " GROUP BY rpcMethod" + var dBResponse []model.DBResponseRPCMethod + err := r.db.Select(ctx, &dBResponse, finalQuery, args...) 
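For context, with the default names defined in options.go earlier in this diff, the rpcMethod aggregation assembled above reduces to a statement of roughly the following shape (a sketch assuming no extra span filters were appended to `query`; the `@` placeholders are bound through `clickhouse.Named` exactly as in the surrounding code):

```go
// Sketch of the assembled rpcMethod aggregation, assuming the default
// signoz_traces.signoz_index_v2 table and an otherwise empty filter set.
const rpcMethodQuerySketch = `SELECT rpcMethod, count() as count
	FROM signoz_traces.signoz_index_v2
	WHERE timestamp >= @timestampL AND timestamp <= @timestampU
	GROUP BY rpcMethod`
```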
+ zap.S().Info(finalQuery) + + if err != nil { + zap.S().Debug("Error in processing sql query: ", err) + return nil, &model.ApiError{Typ: model.ErrorExec, Err: fmt.Errorf("error in processing sql query: %s", err)} + } + for _, service := range dBResponse { + if service.RPCMethod != "" { + traceFilterReponse.RPCMethod[service.RPCMethod] = service.Count + } + } + + case constants.ResponseStatusCode: + finalQuery := fmt.Sprintf("SELECT responseStatusCode, count() as count FROM %s.%s WHERE timestamp >= @timestampL AND timestamp <= @timestampU", r.traceDB, r.indexTable) + finalQuery += query + finalQuery += " GROUP BY responseStatusCode" + var dBResponse []model.DBResponseStatusCodeMethod + err := r.db.Select(ctx, &dBResponse, finalQuery, args...) + zap.S().Info(finalQuery) + + if err != nil { + zap.S().Debug("Error in processing sql query: ", err) + return nil, &model.ApiError{Typ: model.ErrorExec, Err: fmt.Errorf("error in processing sql query: %s", err)} + } + for _, service := range dBResponse { + if service.ResponseStatusCode != "" { + traceFilterReponse.ResponseStatusCode[service.ResponseStatusCode] = service.Count + } + } + default: return nil, &model.ApiError{Typ: model.ErrorBadData, Err: fmt.Errorf("filter type: %s not supported", e)} } @@ -1649,6 +1204,14 @@ func (r *ClickHouseReader) GetFilteredSpans(ctx context.Context, queryParams *mo if len(queryParams.Operation) > 0 { args = buildFilterArrayQuery(ctx, excludeMap, queryParams.Operation, constants.OperationDB, &query, args) } + if len(queryParams.RPCMethod) > 0 { + args = buildFilterArrayQuery(ctx, excludeMap, queryParams.RPCMethod, constants.RPCMethod, &query, args) + } + + if len(queryParams.ResponseStatusCode) > 0 { + args = buildFilterArrayQuery(ctx, excludeMap, queryParams.ResponseStatusCode, constants.ResponseStatusCode, &query, args) + } + if len(queryParams.MinDuration) != 0 { query = query + " AND durationNano >= @durationNanoMin" args = append(args, clickhouse.Named("durationNanoMin", queryParams.MinDuration)) @@ -1708,17 +1271,17 @@ func (r *ClickHouseReader) GetFilteredSpans(ctx context.Context, queryParams *mo var getFilterSpansResponseItems []model.GetFilterSpansResponseItem - baseQuery := fmt.Sprintf("SELECT timestamp, spanID, traceID, serviceName, name, durationNano, httpCode, gRPCCode, gRPCMethod, httpMethod FROM %s WHERE timestamp >= @timestampL AND timestamp <= @timestampU", queryTable) + baseQuery := fmt.Sprintf("SELECT timestamp, spanID, traceID, serviceName, name, durationNano, httpCode, gRPCCode, gRPCMethod, httpMethod, rpcMethod, responseStatusCode FROM %s WHERE timestamp >= @timestampL AND timestamp <= @timestampU", queryTable) baseQuery += query err := r.db.Select(ctx, &getFilterSpansResponseItems, baseQuery, args...) 
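The fill loop that follows flips the selection logic: the status code and method now key on the gRPC fields being populated rather than on the HTTP fields being empty, so a span that carries both attribute sets surfaces its gRPC values instead of the HTTP ones. A condensed sketch of the new precedence (the helper name is illustrative; the real loop writes into getFilterSpansResponseItems in place):

```go
// pickStatusCode mirrors the selection below: prefer the gRPC status code
// whenever the span carries one, and fall back to the HTTP code otherwise.
func pickStatusCode(grpcCode, httpCode string) string {
	if grpcCode != "" {
		return grpcCode
	}
	return httpCode
}
```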
// Fill status and method for i, e := range getFilterSpansResponseItems { - if e.HttpCode == "" { + if e.GRPCode != "" { getFilterSpansResponseItems[i].StatusCode = e.GRPCode } else { getFilterSpansResponseItems[i].StatusCode = e.HttpCode } - if e.HttpMethod == "" { + if e.GRPMethod != "" { getFilterSpansResponseItems[i].Method = e.GRPMethod } else { getFilterSpansResponseItems[i].Method = e.HttpMethod @@ -1831,6 +1394,12 @@ func (r *ClickHouseReader) GetTagFilters(ctx context.Context, queryParams *model if len(queryParams.Operation) > 0 { args = buildFilterArrayQuery(ctx, excludeMap, queryParams.Operation, constants.OperationDB, &query, args) } + if len(queryParams.RPCMethod) > 0 { + args = buildFilterArrayQuery(ctx, excludeMap, queryParams.RPCMethod, constants.RPCMethod, &query, args) + } + if len(queryParams.ResponseStatusCode) > 0 { + args = buildFilterArrayQuery(ctx, excludeMap, queryParams.ResponseStatusCode, constants.ResponseStatusCode, &query, args) + } if len(queryParams.MinDuration) != 0 { query = query + " AND durationNano >= @durationNanoMin" args = append(args, clickhouse.Named("durationNanoMin", queryParams.MinDuration)) @@ -2168,6 +1737,11 @@ func (r *ClickHouseReader) GetFilteredSpansAggregates(ctx context.Context, query query = fmt.Sprintf("SELECT toStartOfInterval(timestamp, INTERVAL %d minute) as time, dbSystem as groupBy, %s FROM %s.%s WHERE timestamp >= @timestampL AND timestamp <= @timestampU", queryParams.StepSeconds/60, aggregation_query, r.traceDB, r.indexTable) case constants.Component: query = fmt.Sprintf("SELECT toStartOfInterval(timestamp, INTERVAL %d minute) as time, component as groupBy, %s FROM %s.%s WHERE timestamp >= @timestampL AND timestamp <= @timestampU", queryParams.StepSeconds/60, aggregation_query, r.traceDB, r.indexTable) + case constants.RPCMethod: + query = fmt.Sprintf("SELECT toStartOfInterval(timestamp, INTERVAL %d minute) as time, rpcMethod as groupBy, %s FROM %s.%s WHERE timestamp >= @timestampL AND timestamp <= @timestampU", queryParams.StepSeconds/60, aggregation_query, r.traceDB, r.indexTable) + case constants.ResponseStatusCode: + query = fmt.Sprintf("SELECT toStartOfInterval(timestamp, INTERVAL %d minute) as time, responseStatusCode as groupBy, %s FROM %s.%s WHERE timestamp >= @timestampL AND timestamp <= @timestampU", queryParams.StepSeconds/60, aggregation_query, r.traceDB, r.indexTable) + default: return nil, &model.ApiError{Typ: model.ErrorBadData, Err: fmt.Errorf("groupBy type: %s not supported", queryParams.GroupBy)} } @@ -2199,6 +1773,12 @@ func (r *ClickHouseReader) GetFilteredSpansAggregates(ctx context.Context, query if len(queryParams.Operation) > 0 { args = buildFilterArrayQuery(ctx, excludeMap, queryParams.Operation, constants.OperationDB, &query, args) } + if len(queryParams.RPCMethod) > 0 { + args = buildFilterArrayQuery(ctx, excludeMap, queryParams.RPCMethod, constants.RPCMethod, &query, args) + } + if len(queryParams.ResponseStatusCode) > 0 { + args = buildFilterArrayQuery(ctx, excludeMap, queryParams.ResponseStatusCode, constants.ResponseStatusCode, &query, args) + } if len(queryParams.MinDuration) != 0 { query = query + " AND durationNano >= @durationNanoMin" args = append(args, clickhouse.Named("durationNanoMin", queryParams.MinDuration)) @@ -2247,6 +1827,11 @@ func (r *ClickHouseReader) GetFilteredSpansAggregates(ctx context.Context, query query = query + " GROUP BY time, dbSystem as groupBy ORDER BY time" case constants.Component: query = query + " GROUP BY time, component as groupBy ORDER BY time" + case 
constants.RPCMethod: + query = query + " GROUP BY time, rpcMethod as groupBy ORDER BY time" + case constants.ResponseStatusCode: + query = query + " GROUP BY time, responseStatusCode as groupBy ORDER BY time" + default: return nil, &model.ApiError{Typ: model.ErrorBadData, Err: fmt.Errorf("groupBy type: %s not supported", queryParams.GroupBy)} } @@ -2643,15 +2228,30 @@ func (r *ClickHouseReader) GetTTL(ctx context.Context, ttlParams *model.GetTTLPa } -func (r *ClickHouseReader) GetErrors(ctx context.Context, queryParams *model.GetErrorsParams) (*[]model.Error, *model.ApiError) { +func (r *ClickHouseReader) ListErrors(ctx context.Context, queryParams *model.ListErrorsParams) (*[]model.Error, *model.ApiError) { - var getErrorReponses []model.Error + var getErrorResponses []model.Error - query := fmt.Sprintf("SELECT exceptionType, exceptionMessage, count() AS exceptionCount, min(timestamp) as firstSeen, max(timestamp) as lastSeen, serviceName FROM %s.%s WHERE timestamp >= @timestampL AND timestamp <= @timestampU GROUP BY serviceName, exceptionType, exceptionMessage", r.traceDB, r.errorTable) + query := fmt.Sprintf("SELECT any(exceptionType) as exceptionType, any(exceptionMessage) as exceptionMessage, count() AS exceptionCount, min(timestamp) as firstSeen, max(timestamp) as lastSeen, any(serviceName) as serviceName, groupID FROM %s.%s WHERE timestamp >= @timestampL AND timestamp <= @timestampU GROUP BY groupID", r.traceDB, r.errorTable) args := []interface{}{clickhouse.Named("timestampL", strconv.FormatInt(queryParams.Start.UnixNano(), 10)), clickhouse.Named("timestampU", strconv.FormatInt(queryParams.End.UnixNano(), 10))} + if len(queryParams.OrderParam) != 0 { + if queryParams.Order == constants.Descending { + query = query + " ORDER BY " + queryParams.OrderParam + " DESC" + } else if queryParams.Order == constants.Ascending { + query = query + " ORDER BY " + queryParams.OrderParam + " ASC" + } + } + if queryParams.Limit > 0 { + query = query + " LIMIT @limit" + args = append(args, clickhouse.Named("limit", queryParams.Limit)) + } - err := r.db.Select(ctx, &getErrorReponses, query, args...) + if queryParams.Offset > 0 { + query = query + " OFFSET @offset" + args = append(args, clickhouse.Named("offset", queryParams.Offset)) + } + err := r.db.Select(ctx, &getErrorResponses, query, args...) 
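By the time execution reaches the Select above, `query` is fully assembled. For illustration, a request such as `GET /api/v1/listErrors?start=<t1>&end=<t2>&orderParam=lastSeen&order=descending&limit=10&offset=20` (parameter names as validated in parseListErrorsRequest later in this diff) would drive the builder to roughly:

```go
// Sketch of the statement ListErrors executes for the request above, with the
// default signoz_traces.signoz_error_index_v2 table; the @ placeholders are
// bound via clickhouse.Named as in the surrounding code. orderParam is
// concatenated into the SQL, which is safe only because the parser rejects
// values outside its allowlist.
const listErrorsQuerySketch = `SELECT any(exceptionType) as exceptionType,
	any(exceptionMessage) as exceptionMessage, count() AS exceptionCount,
	min(timestamp) as firstSeen, max(timestamp) as lastSeen,
	any(serviceName) as serviceName, groupID
	FROM signoz_traces.signoz_error_index_v2
	WHERE timestamp >= @timestampL AND timestamp <= @timestampU
	GROUP BY groupID
	ORDER BY lastSeen DESC
	LIMIT @limit OFFSET @offset`
```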
zap.S().Info(query) if err != nil { @@ -2659,30 +2259,41 @@ func (r *ClickHouseReader) GetErrors(ctx context.Context, queryParams *model.Get return nil, &model.ApiError{Typ: model.ErrorExec, Err: fmt.Errorf("Error in processing sql query")} } - return &getErrorReponses, nil - + return &getErrorResponses, nil } -func (r *ClickHouseReader) GetErrorForId(ctx context.Context, queryParams *model.GetErrorParams) (*model.ErrorWithSpan, *model.ApiError) { +func (r *ClickHouseReader) CountErrors(ctx context.Context, queryParams *model.CountErrorsParams) (uint64, *model.ApiError) { + + var errorCount uint64 + + query := fmt.Sprintf("SELECT count(distinct(groupID)) FROM %s.%s WHERE timestamp >= @timestampL AND timestamp <= @timestampU", r.traceDB, r.errorTable) + args := []interface{}{clickhouse.Named("timestampL", strconv.FormatInt(queryParams.Start.UnixNano(), 10)), clickhouse.Named("timestampU", strconv.FormatInt(queryParams.End.UnixNano(), 10))} + + err := r.db.QueryRow(ctx, query, args...).Scan(&errorCount) + zap.S().Info(query) + + if err != nil { + zap.S().Debug("Error in processing sql query: ", err) + return 0, &model.ApiError{Typ: model.ErrorExec, Err: fmt.Errorf("Error in processing sql query")} + } + + return errorCount, nil +} + +func (r *ClickHouseReader) GetErrorFromErrorID(ctx context.Context, queryParams *model.GetErrorParams) (*model.ErrorWithSpan, *model.ApiError) { if queryParams.ErrorID == "" { zap.S().Debug("errorId missing from params") - return nil, &model.ApiError{Typ: model.ErrorExec, Err: fmt.Errorf("ErrorID missing from params")} + return nil, &model.ApiError{Typ: model.ErrorBadData, Err: fmt.Errorf("ErrorID missing from params")} } var getErrorWithSpanReponse []model.ErrorWithSpan - // TODO: Optimize this query further - query := fmt.Sprintf("SELECT spanID, traceID, errorID, timestamp, serviceName, exceptionType, exceptionMessage, exceptionStacktrace, exceptionEscaped, olderErrorId, newerErrorId FROM (SELECT *, lagInFrame(toNullable(errorID)) over w as olderErrorId, leadInFrame(toNullable(errorID)) over w as newerErrorId FROM %s.%s window w as (ORDER BY exceptionType, serviceName, timestamp rows between unbounded preceding and unbounded following)) WHERE errorID = @errorID", r.traceDB, r.errorTable) - args := []interface{}{clickhouse.Named("errorID", queryParams.ErrorID)} + query := fmt.Sprintf("SELECT * FROM %s.%s WHERE timestamp = @timestamp AND groupID = @groupID AND errorID = @errorID LIMIT 1", r.traceDB, r.errorTable) + args := []interface{}{clickhouse.Named("errorID", queryParams.ErrorID), clickhouse.Named("groupID", queryParams.GroupID), clickhouse.Named("timestamp", strconv.FormatInt(queryParams.Timestamp.UnixNano(), 10))} err := r.db.Select(ctx, &getErrorWithSpanReponse, query, args...) 
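The Select above replaces the old window-function scan (lagInFrame/leadInFrame over the whole error table) with a point read keyed by the (timestamp, groupID, errorID) triple, while neighbour navigation moves into GetNextPrevErrorIDs below. The call shapes against the new routes look like this (a sketch; the parameter names come from parseGetErrorRequest in parser.go later in this diff, and all values are placeholders):

```go
// Illustrative request shapes only; every value is a placeholder.
//
//	GET /api/v1/errorFromErrorID?timestamp=<unix-nano>&groupID=<groupID>&errorID=<errorID>
//	GET /api/v1/errorFromGroupID?timestamp=<unix-nano>&groupID=<groupID>
//	GET /api/v1/nextPrevErrorIDs?timestamp=<unix-nano>&groupID=<groupID>&errorID=<errorID>
```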
- zap.S().Info(query) - if err == sql.ErrNoRows { - return nil, nil - } - if err != nil { zap.S().Debug("Error in processing sql query: ", err) return nil, &model.ApiError{Typ: model.ErrorExec, Err: fmt.Errorf("Error in processing sql query")} @@ -2691,22 +2302,17 @@ func (r *ClickHouseReader) GetErrorForId(ctx context.Context, queryParams *model if len(getErrorWithSpanReponse) > 0 { return &getErrorWithSpanReponse[0], nil } else { - return &model.ErrorWithSpan{}, &model.ApiError{Typ: model.ErrorNotFound, Err: fmt.Errorf("Error ID not found")} + return nil, &model.ApiError{Typ: model.ErrorNotFound, Err: fmt.Errorf("Error/Exception not found")} } } -func (r *ClickHouseReader) GetErrorForType(ctx context.Context, queryParams *model.GetErrorParams) (*model.ErrorWithSpan, *model.ApiError) { +func (r *ClickHouseReader) GetErrorFromGroupID(ctx context.Context, queryParams *model.GetErrorParams) (*model.ErrorWithSpan, *model.ApiError) { - if queryParams.ErrorType == "" || queryParams.ServiceName == "" { - zap.S().Debug("errorType/serviceName missing from params") - return nil, &model.ApiError{Typ: model.ErrorExec, Err: fmt.Errorf("ErrorType/serviceName missing from params")} - } var getErrorWithSpanReponse []model.ErrorWithSpan - // TODO: Optimize this query further - query := fmt.Sprintf("SELECT spanID, traceID, errorID, timestamp , serviceName, exceptionType, exceptionMessage, exceptionStacktrace, exceptionEscaped, newerErrorId, olderErrorId FROM (SELECT *, lagInFrame(errorID) over w as olderErrorId, leadInFrame(errorID) over w as newerErrorId FROM %s.%s WHERE serviceName = @serviceName AND exceptionType = @errorType window w as (ORDER BY timestamp DESC rows between unbounded preceding and unbounded following))", r.traceDB, r.errorTable) - args := []interface{}{clickhouse.Named("serviceName", queryParams.ServiceName), clickhouse.Named("errorType", queryParams.ErrorType)} + query := fmt.Sprintf("SELECT * FROM %s.%s WHERE timestamp = @timestamp AND groupID = @groupID LIMIT 1", r.traceDB, r.errorTable) + args := []interface{}{clickhouse.Named("groupID", queryParams.GroupID), clickhouse.Named("timestamp", strconv.FormatInt(queryParams.Timestamp.UnixNano(), 10))} err := r.db.Select(ctx, &getErrorWithSpanReponse, query, args...) 
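The hunk below adds the neighbour navigation backing /api/v1/nextPrevErrorIDs. Within a groupID it walks to the adjacent error by timestamp, and because several errors can land on the same timestamp it falls back to errorID ordering as a tie-breaker. A compact sketch of the decision flow in getNextErrorID (getPrevErrorID mirrors it with the comparisons reversed):

```go
// rows stands for the first, two-row candidate query: errors in the same
// group with timestamp >= t, the current errorID excluded, ordered ascending.
switch {
case len(rows) == 0:
	// no newer error in this group: return an empty ID and zero time
case len(rows) == 1:
	// exactly one candidate: unambiguous neighbour
case rows[0].Timestamp.Equal(rows[1].Timestamp):
	// tie on timestamp: re-query at exactly t ordered by errorID, and if
	// that also comes back empty, take the first error strictly after t
default:
	// the first candidate is strictly closest: take it
}
```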
@@ -2720,11 +2326,173 @@ func (r *ClickHouseReader) GetErrorForType(ctx context.Context, queryParams *mod if len(getErrorWithSpanReponse) > 0 { return &getErrorWithSpanReponse[0], nil } else { - return nil, &model.ApiError{Typ: model.ErrorUnavailable, Err: fmt.Errorf("Error/Exception not found")} + return nil, &model.ApiError{Typ: model.ErrorNotFound, Err: fmt.Errorf("Error/Exception not found")} } } +func (r *ClickHouseReader) GetNextPrevErrorIDs(ctx context.Context, queryParams *model.GetErrorParams) (*model.NextPrevErrorIDs, *model.ApiError) { + + if queryParams.ErrorID == "" { + zap.S().Debug("errorId missing from params") + return nil, &model.ApiError{Typ: model.ErrorBadData, Err: fmt.Errorf("ErrorID missing from params")} + } + var err *model.ApiError + getNextPrevErrorIDsResponse := model.NextPrevErrorIDs{ + GroupID: queryParams.GroupID, + } + getNextPrevErrorIDsResponse.NextErrorID, getNextPrevErrorIDsResponse.NextTimestamp, err = r.getNextErrorID(ctx, queryParams) + if err != nil { + zap.S().Debug("Unable to get next error ID due to err: ", err) + return nil, err + } + getNextPrevErrorIDsResponse.PrevErrorID, getNextPrevErrorIDsResponse.PrevTimestamp, err = r.getPrevErrorID(ctx, queryParams) + if err != nil { + zap.S().Debug("Unable to get prev error ID due to err: ", err) + return nil, err + } + return &getNextPrevErrorIDsResponse, nil + +} + +func (r *ClickHouseReader) getNextErrorID(ctx context.Context, queryParams *model.GetErrorParams) (string, time.Time, *model.ApiError) { + + var getNextErrorIDReponse []model.NextPrevErrorIDsDBResponse + + query := fmt.Sprintf("SELECT errorID as nextErrorID, timestamp as nextTimestamp FROM %s.%s WHERE groupID = @groupID AND timestamp >= @timestamp AND errorID != @errorID ORDER BY timestamp ASC LIMIT 2", r.traceDB, r.errorTable) + args := []interface{}{clickhouse.Named("errorID", queryParams.ErrorID), clickhouse.Named("groupID", queryParams.GroupID), clickhouse.Named("timestamp", strconv.FormatInt(queryParams.Timestamp.UnixNano(), 10))} + + err := r.db.Select(ctx, &getNextErrorIDReponse, query, args...) + + zap.S().Info(query) + + if err != nil { + zap.S().Debug("Error in processing sql query: ", err) + return "", time.Time{}, &model.ApiError{Typ: model.ErrorExec, Err: fmt.Errorf("Error in processing sql query")} + } + if len(getNextErrorIDReponse) == 0 { + zap.S().Info("NextErrorID not found") + return "", time.Time{}, nil + } else if len(getNextErrorIDReponse) == 1 { + zap.S().Info("NextErrorID found") + return getNextErrorIDReponse[0].NextErrorID, getNextErrorIDReponse[0].NextTimestamp, nil + } else { + if getNextErrorIDReponse[0].Timestamp.UnixNano() == getNextErrorIDReponse[1].Timestamp.UnixNano() { + var getNextErrorIDReponse []model.NextPrevErrorIDsDBResponse + + query := fmt.Sprintf("SELECT errorID as nextErrorID, timestamp as nextTimestamp FROM %s.%s WHERE groupID = @groupID AND timestamp = @timestamp AND errorID > @errorID ORDER BY errorID ASC LIMIT 1", r.traceDB, r.errorTable) + args := []interface{}{clickhouse.Named("errorID", queryParams.ErrorID), clickhouse.Named("groupID", queryParams.GroupID), clickhouse.Named("timestamp", strconv.FormatInt(queryParams.Timestamp.UnixNano(), 10))} + + err := r.db.Select(ctx, &getNextErrorIDReponse, query, args...) 
+ + zap.S().Info(query) + + if err != nil { + zap.S().Debug("Error in processing sql query: ", err) + return "", time.Time{}, &model.ApiError{Typ: model.ErrorExec, Err: fmt.Errorf("Error in processing sql query")} + } + if len(getNextErrorIDReponse) == 0 { + var getNextErrorIDReponse []model.NextPrevErrorIDsDBResponse + + query := fmt.Sprintf("SELECT errorID as nextErrorID, timestamp as nextTimestamp FROM %s.%s WHERE groupID = @groupID AND timestamp > @timestamp ORDER BY timestamp ASC LIMIT 1", r.traceDB, r.errorTable) + args := []interface{}{clickhouse.Named("errorID", queryParams.ErrorID), clickhouse.Named("groupID", queryParams.GroupID), clickhouse.Named("timestamp", strconv.FormatInt(queryParams.Timestamp.UnixNano(), 10))} + + err := r.db.Select(ctx, &getNextErrorIDReponse, query, args...) + + zap.S().Info(query) + + if err != nil { + zap.S().Debug("Error in processing sql query: ", err) + return "", time.Time{}, &model.ApiError{Typ: model.ErrorExec, Err: fmt.Errorf("Error in processing sql query")} + } + + if len(getNextErrorIDReponse) == 0 { + zap.S().Info("NextErrorID not found") + return "", time.Time{}, nil + } else { + zap.S().Info("NextErrorID found") + return getNextErrorIDReponse[0].NextErrorID, getNextErrorIDReponse[0].NextTimestamp, nil + } + } else { + zap.S().Info("NextErrorID found") + return getNextErrorIDReponse[0].NextErrorID, getNextErrorIDReponse[0].NextTimestamp, nil + } + } else { + zap.S().Info("NextErrorID found") + return getNextErrorIDReponse[0].NextErrorID, getNextErrorIDReponse[0].NextTimestamp, nil + } + } +} + +func (r *ClickHouseReader) getPrevErrorID(ctx context.Context, queryParams *model.GetErrorParams) (string, time.Time, *model.ApiError) { + + var getPrevErrorIDReponse []model.NextPrevErrorIDsDBResponse + + query := fmt.Sprintf("SELECT errorID as prevErrorID, timestamp as prevTimestamp FROM %s.%s WHERE groupID = @groupID AND timestamp <= @timestamp AND errorID != @errorID ORDER BY timestamp DESC LIMIT 2", r.traceDB, r.errorTable) + args := []interface{}{clickhouse.Named("errorID", queryParams.ErrorID), clickhouse.Named("groupID", queryParams.GroupID), clickhouse.Named("timestamp", strconv.FormatInt(queryParams.Timestamp.UnixNano(), 10))} + + err := r.db.Select(ctx, &getPrevErrorIDReponse, query, args...) + + zap.S().Info(query) + + if err != nil { + zap.S().Debug("Error in processing sql query: ", err) + return "", time.Time{}, &model.ApiError{Typ: model.ErrorExec, Err: fmt.Errorf("Error in processing sql query")} + } + if len(getPrevErrorIDReponse) == 0 { + zap.S().Info("PrevErrorID not found") + return "", time.Time{}, nil + } else if len(getPrevErrorIDReponse) == 1 { + zap.S().Info("PrevErrorID found") + return getPrevErrorIDReponse[0].PrevErrorID, getPrevErrorIDReponse[0].PrevTimestamp, nil + } else { + if getPrevErrorIDReponse[0].Timestamp.UnixNano() == getPrevErrorIDReponse[1].Timestamp.UnixNano() { + var getPrevErrorIDReponse []model.NextPrevErrorIDsDBResponse + + query := fmt.Sprintf("SELECT errorID as prevErrorID, timestamp as prevTimestamp FROM %s.%s WHERE groupID = @groupID AND timestamp = @timestamp AND errorID < @errorID ORDER BY errorID DESC LIMIT 1", r.traceDB, r.errorTable) + args := []interface{}{clickhouse.Named("errorID", queryParams.ErrorID), clickhouse.Named("groupID", queryParams.GroupID), clickhouse.Named("timestamp", strconv.FormatInt(queryParams.Timestamp.UnixNano(), 10))} + + err := r.db.Select(ctx, &getPrevErrorIDReponse, query, args...) 
+ + zap.S().Info(query) + + if err != nil { + zap.S().Debug("Error in processing sql query: ", err) + return "", time.Time{}, &model.ApiError{Typ: model.ErrorExec, Err: fmt.Errorf("Error in processing sql query")} + } + if len(getPrevErrorIDReponse) == 0 { + var getPrevErrorIDReponse []model.NextPrevErrorIDsDBResponse + + query := fmt.Sprintf("SELECT errorID as prevErrorID, timestamp as prevTimestamp FROM %s.%s WHERE groupID = @groupID AND timestamp < @timestamp ORDER BY timestamp DESC LIMIT 1", r.traceDB, r.errorTable) + args := []interface{}{clickhouse.Named("errorID", queryParams.ErrorID), clickhouse.Named("groupID", queryParams.GroupID), clickhouse.Named("timestamp", strconv.FormatInt(queryParams.Timestamp.UnixNano(), 10))} + + err := r.db.Select(ctx, &getPrevErrorIDReponse, query, args...) + + zap.S().Info(query) + + if err != nil { + zap.S().Debug("Error in processing sql query: ", err) + return "", time.Time{}, &model.ApiError{Typ: model.ErrorExec, Err: fmt.Errorf("Error in processing sql query")} + } + + if len(getPrevErrorIDReponse) == 0 { + zap.S().Info("PrevErrorID not found") + return "", time.Time{}, nil + } else { + zap.S().Info("PrevErrorID found") + return getPrevErrorIDReponse[0].PrevErrorID, getPrevErrorIDReponse[0].PrevTimestamp, nil + } + } else { + zap.S().Info("PrevErrorID found") + return getPrevErrorIDReponse[0].PrevErrorID, getPrevErrorIDReponse[0].PrevTimestamp, nil + } + } else { + zap.S().Info("PrevErrorID found") + return getPrevErrorIDReponse[0].PrevErrorID, getPrevErrorIDReponse[0].PrevTimestamp, nil + } + } +} + func (r *ClickHouseReader) GetMetricAutocompleteTagKey(ctx context.Context, params *model.MetricAutocompleteTagParams) (*[]string, *model.ApiError) { var query string diff --git a/pkg/query-service/app/http_handler.go b/pkg/query-service/app/http_handler.go index 92caf4ed69..8e5233d744 100644 --- a/pkg/query-service/app/http_handler.go +++ b/pkg/query-service/app/http_handler.go @@ -22,10 +22,12 @@ import ( "go.signoz.io/query-service/app/parser" "go.signoz.io/query-service/auth" "go.signoz.io/query-service/constants" + "go.signoz.io/query-service/dao" am "go.signoz.io/query-service/integrations/alertManager" "go.signoz.io/query-service/interfaces" "go.signoz.io/query-service/model" + "go.signoz.io/query-service/rules" "go.signoz.io/query-service/telemetry" "go.signoz.io/query-service/version" "go.uber.org/zap" @@ -52,17 +54,22 @@ type APIHandler struct { reader *interfaces.Reader relationalDB dao.ModelDao alertManager am.Manager + ruleManager *rules.Manager ready func(http.HandlerFunc) http.HandlerFunc } // NewAPIHandler returns an APIHandler -func NewAPIHandler(reader *interfaces.Reader, relationalDB dao.ModelDao) (*APIHandler, error) { +func NewAPIHandler(reader *interfaces.Reader, relationalDB dao.ModelDao, ruleManager *rules.Manager) (*APIHandler, error) { - alertManager := am.New("") + alertManager, err := am.New("") + if err != nil { + return nil, err + } aH := &APIHandler{ reader: reader, relationalDB: relationalDB, alertManager: alertManager, + ruleManager: ruleManager, } aH.ready = aH.testReady @@ -299,7 +306,7 @@ func (aH *APIHandler) RegisterRoutes(router *mux.Router) { router.HandleFunc("/api/v1/channels/{id}", AdminAccess(aH.deleteChannel)).Methods(http.MethodDelete) router.HandleFunc("/api/v1/channels", EditAccess(aH.createChannel)).Methods(http.MethodPost) router.HandleFunc("/api/v1/testChannel", EditAccess(aH.testChannel)).Methods(http.MethodPost) - router.HandleFunc("/api/v1/rules", 
ViewAccess(aH.listRulesFromProm)).Methods(http.MethodGet) + router.HandleFunc("/api/v1/rules", ViewAccess(aH.listRules)).Methods(http.MethodGet) router.HandleFunc("/api/v1/rules/{id}", ViewAccess(aH.getRule)).Methods(http.MethodGet) router.HandleFunc("/api/v1/rules", EditAccess(aH.createRule)).Methods(http.MethodPost) router.HandleFunc("/api/v1/rules/{id}", EditAccess(aH.editRule)).Methods(http.MethodPut) @@ -329,11 +336,13 @@ func (aH *APIHandler) RegisterRoutes(router *mux.Router) { router.HandleFunc("/api/v1/getTagFilters", ViewAccess(aH.getTagFilters)).Methods(http.MethodPost) router.HandleFunc("/api/v1/getFilteredSpans", ViewAccess(aH.getFilteredSpans)).Methods(http.MethodPost) router.HandleFunc("/api/v1/getFilteredSpans/aggregates", ViewAccess(aH.getFilteredSpanAggregates)).Methods(http.MethodPost) - router.HandleFunc("/api/v1/getTagValues", ViewAccess(aH.getTagValues)).Methods(http.MethodPost) - router.HandleFunc("/api/v1/errors", ViewAccess(aH.getErrors)).Methods(http.MethodGet) - router.HandleFunc("/api/v1/errorWithId", ViewAccess(aH.getErrorForId)).Methods(http.MethodGet) - router.HandleFunc("/api/v1/errorWithType", ViewAccess(aH.getErrorForType)).Methods(http.MethodGet) + + router.HandleFunc("/api/v1/listErrors", ViewAccess(aH.listErrors)).Methods(http.MethodGet) + router.HandleFunc("/api/v1/countErrors", ViewAccess(aH.countErrors)).Methods(http.MethodGet) + router.HandleFunc("/api/v1/errorFromErrorID", ViewAccess(aH.getErrorFromErrorID)).Methods(http.MethodGet) + router.HandleFunc("/api/v1/errorFromGroupID", ViewAccess(aH.getErrorFromGroupID)).Methods(http.MethodGet) + router.HandleFunc("/api/v1/nextPrevErrorIDs", ViewAccess(aH.getNextPrevErrorIDs)).Methods(http.MethodGet) router.HandleFunc("/api/v1/disks", ViewAccess(aH.getDisks)).Methods(http.MethodGet) @@ -381,12 +390,12 @@ func Intersection(a, b []int) (c []int) { func (aH *APIHandler) getRule(w http.ResponseWriter, r *http.Request) { id := mux.Vars(r)["id"] - alertList, apiErrorObj := (*aH.reader).GetRule(id) - if apiErrorObj != nil { - respondError(w, apiErrorObj, nil) + ruleResponse, err := aH.ruleManager.GetRule(id) + if err != nil { + respondError(w, &model.ApiError{Typ: model.ErrorInternal, Err: err}, nil) return } - aH.respond(w, alertList) + aH.respond(w, ruleResponse) } func (aH *APIHandler) metricAutocompleteMetricName(w http.ResponseWriter, r *http.Request) { @@ -617,13 +626,17 @@ func (aH *APIHandler) queryRangeMetricsV2(w http.ResponseWriter, r *http.Request aH.respond(w, resp) } -func (aH *APIHandler) listRulesFromProm(w http.ResponseWriter, r *http.Request) { - alertList, apiErrorObj := (*aH.reader).ListRulesFromProm() - if apiErrorObj != nil { - respondError(w, apiErrorObj, nil) +func (aH *APIHandler) listRules(w http.ResponseWriter, r *http.Request) { + + rules, err := aH.ruleManager.ListRuleStates() + if err != nil { + respondError(w, &model.ApiError{Typ: model.ErrorInternal, Err: err}, nil) return } - aH.respond(w, alertList) + + // todo(amol): need to add sorter + + aH.respond(w, rules) } func (aH *APIHandler) getDashboards(w http.ResponseWriter, r *http.Request) { @@ -759,32 +772,35 @@ func (aH *APIHandler) createDashboards(w http.ResponseWriter, r *http.Request) { } func (aH *APIHandler) deleteRule(w http.ResponseWriter, r *http.Request) { + id := mux.Vars(r)["id"] - apiErrorObj := (*aH.reader).DeleteRule(id) + err := aH.ruleManager.DeleteRule(id) - if apiErrorObj != nil { - respondError(w, apiErrorObj, nil) + if err != nil { + respondError(w, &model.ApiError{Typ: model.ErrorInternal, Err: err}, nil) 
return } aH.respond(w, "rule successfully deleted") } + func (aH *APIHandler) editRule(w http.ResponseWriter, r *http.Request) { id := mux.Vars(r)["id"] - var postData map[string]string - err := json.NewDecoder(r.Body).Decode(&postData) + defer r.Body.Close() + body, err := ioutil.ReadAll(r.Body) if err != nil { - respondError(w, &model.ApiError{Typ: model.ErrorBadData, Err: err}, "Error reading request body") + zap.S().Errorf("msg: error in getting req body of edit rule API\n", "\t error:", err) + respondError(w, &model.ApiError{Typ: model.ErrorBadData, Err: err}, nil) return } - apiErrorObj := (*aH.reader).EditRule(postData["data"], id) + err = aH.ruleManager.EditRule(string(body), id) - if apiErrorObj != nil { - respondError(w, apiErrorObj, nil) + if err != nil { + respondError(w, &model.ApiError{Typ: model.ErrorInternal, Err: err}, nil) return } @@ -908,20 +924,17 @@ func (aH *APIHandler) createChannel(w http.ResponseWriter, r *http.Request) { func (aH *APIHandler) createRule(w http.ResponseWriter, r *http.Request) { - decoder := json.NewDecoder(r.Body) - - var postData map[string]string - err := decoder.Decode(&postData) - + defer r.Body.Close() + body, err := ioutil.ReadAll(r.Body) if err != nil { + zap.S().Errorf("Error in getting req body for create rule API\n", err) respondError(w, &model.ApiError{Typ: model.ErrorBadData, Err: err}, nil) return } - apiErrorObj := (*aH.reader).CreateRule(postData["data"]) - - if apiErrorObj != nil { - respondError(w, apiErrorObj, nil) + err = aH.ruleManager.CreateRule(string(body)) + if err != nil { + respondError(w, &model.ApiError{Typ: model.ErrorBadData, Err: err}, nil) return } @@ -1179,49 +1192,78 @@ func (aH *APIHandler) searchTraces(w http.ResponseWriter, r *http.Request) { } -func (aH *APIHandler) getErrors(w http.ResponseWriter, r *http.Request) { +func (aH *APIHandler) listErrors(w http.ResponseWriter, r *http.Request) { - query, err := parseErrorsRequest(r) + query, err := parseListErrorsRequest(r) if aH.handleError(w, err, http.StatusBadRequest) { return } - result, apiErr := (*aH.reader).GetErrors(r.Context(), query) + result, apiErr := (*aH.reader).ListErrors(r.Context(), query) if apiErr != nil && aH.handleError(w, apiErr.Err, http.StatusInternalServerError) { return } aH.writeJSON(w, r, result) - } -func (aH *APIHandler) getErrorForId(w http.ResponseWriter, r *http.Request) { +func (aH *APIHandler) countErrors(w http.ResponseWriter, r *http.Request) { - query, err := parseErrorRequest(r) + query, err := parseCountErrorsRequest(r) if aH.handleError(w, err, http.StatusBadRequest) { return } - result, apiErr := (*aH.reader).GetErrorForId(r.Context(), query) - if apiErr != nil && aH.handleError(w, apiErr.Err, http.StatusInternalServerError) { + result, apiErr := (*aH.reader).CountErrors(r.Context(), query) + if apiErr != nil { + respondError(w, apiErr, nil) return } aH.writeJSON(w, r, result) - } -func (aH *APIHandler) getErrorForType(w http.ResponseWriter, r *http.Request) { +func (aH *APIHandler) getErrorFromErrorID(w http.ResponseWriter, r *http.Request) { - query, err := parseErrorRequest(r) + query, err := parseGetErrorRequest(r) if aH.handleError(w, err, http.StatusBadRequest) { return } - result, apiErr := (*aH.reader).GetErrorForType(r.Context(), query) - if apiErr != nil && aH.handleError(w, apiErr.Err, http.StatusInternalServerError) { + result, apiErr := (*aH.reader).GetErrorFromErrorID(r.Context(), query) + if apiErr != nil { + respondError(w, apiErr, nil) return } aH.writeJSON(w, r, result) +} +func (aH *APIHandler) 
getNextPrevErrorIDs(w http.ResponseWriter, r *http.Request) {
+
+	query, err := parseGetErrorRequest(r)
+	if aH.handleError(w, err, http.StatusBadRequest) {
+		return
+	}
+	result, apiErr := (*aH.reader).GetNextPrevErrorIDs(r.Context(), query)
+	if apiErr != nil {
+		respondError(w, apiErr, nil)
+		return
+	}
+
+	aH.writeJSON(w, r, result)
+}
+
+func (aH *APIHandler) getErrorFromGroupID(w http.ResponseWriter, r *http.Request) {
+
+	query, err := parseGetErrorRequest(r)
+	if aH.handleError(w, err, http.StatusBadRequest) {
+		return
+	}
+	result, apiErr := (*aH.reader).GetErrorFromGroupID(r.Context(), query)
+	if apiErr != nil {
+		respondError(w, apiErr, nil)
+		return
+	}
+
+	aH.writeJSON(w, r, result)
 }
 
 func (aH *APIHandler) getSpanFilters(w http.ResponseWriter, r *http.Request) {
diff --git a/pkg/query-service/app/parser.go b/pkg/query-service/app/parser.go
index 9d3705da9f..e81b986a3d 100644
--- a/pkg/query-service/app/parser.go
+++ b/pkg/query-service/app/parser.go
@@ -360,28 +360,6 @@ func parseFilteredSpanAggregatesRequest(r *http.Request) (*model.GetFilteredSpan
 	return postData, nil
 }
 
-func parseErrorRequest(r *http.Request) (*model.GetErrorParams, error) {
-
-	params := &model.GetErrorParams{}
-
-	serviceName := r.URL.Query().Get("serviceName")
-	if len(serviceName) != 0 {
-		params.ServiceName = serviceName
-	}
-
-	errorType := r.URL.Query().Get("errorType")
-	if len(errorType) != 0 {
-		params.ErrorType = errorType
-	}
-
-	errorId := r.URL.Query().Get("errorId")
-	if len(errorId) != 0 {
-		params.ErrorID = errorId
-	}
-
-	return params, nil
-}
-
 func parseTagFilterRequest(r *http.Request) (*model.TagFilterParams, error) {
 	var postData *model.TagFilterParams
 	err := json.NewDecoder(r.Body).Decode(&postData)
@@ -427,7 +405,10 @@ func parseTagValueRequest(r *http.Request) (*model.TagFilterParams, error) {
 
 }
 
-func parseErrorsRequest(r *http.Request) (*model.GetErrorsParams, error) {
+func parseListErrorsRequest(r *http.Request) (*model.ListErrorsParams, error) {
+
+	var allowedOrderParams = []string{"exceptionType", "exceptionCount", "firstSeen", "lastSeen", "serviceName"}
+	var allowedOrderDirections = []string{"ascending", "descending"}
 
 	startTime, err := parseTime("start", r)
 	if err != nil {
@@ -438,9 +419,79 @@ func parseErrorsRequest(r *http.Request) (*model.GetErrorsParams, error) {
 		return nil, err
 	}
 
-	params := &model.GetErrorsParams{
-		Start: startTime,
-		End:   endTime,
+	order := r.URL.Query().Get("order")
+	if len(order) > 0 && !DoesExistInSlice(order, allowedOrderDirections) {
+		return nil, fmt.Errorf("given order: %s is not allowed in query", order)
+	}
+	orderParam := r.URL.Query().Get("orderParam")
+	if len(orderParam) > 0 && !DoesExistInSlice(orderParam, allowedOrderParams) {
+		return nil, fmt.Errorf("given orderParam: %s is not allowed in query", orderParam)
+	}
+	limit := r.URL.Query().Get("limit")
+	offset := r.URL.Query().Get("offset")
+
+	if len(offset) == 0 || len(limit) == 0 {
+		return nil, fmt.Errorf("offset or limit param cannot be empty from the query")
+	}
+
+	limitInt, err := strconv.Atoi(limit)
+	if err != nil {
+		return nil, errors.New("limit param is not in correct format")
+	}
+	offsetInt, err := strconv.Atoi(offset)
+	if err != nil {
+		return nil, errors.New("offset param is not in correct format")
+	}
+
+	params := &model.ListErrorsParams{
+		Start:      startTime,
+		End:        endTime,
+		OrderParam: orderParam,
+		Order:      order,
+		Limit:      int64(limitInt),
+		Offset:     int64(offsetInt),
+	}
+
+	return params, nil
+}
+
+func parseCountErrorsRequest(r 
*http.Request) (*model.CountErrorsParams, error) { + + startTime, err := parseTime("start", r) + if err != nil { + return nil, err + } + endTime, err := parseTimeMinusBuffer("end", r) + if err != nil { + return nil, err + } + + params := &model.CountErrorsParams{ + Start: startTime, + End: endTime, + } + + return params, nil +} + +func parseGetErrorRequest(r *http.Request) (*model.GetErrorParams, error) { + + timestamp, err := parseTime("timestamp", r) + if err != nil { + return nil, err + } + + groupID := r.URL.Query().Get("groupID") + + if len(groupID) == 0 { + return nil, fmt.Errorf("groupID param cannot be empty from the query") + } + errorID := r.URL.Query().Get("errorID") + + params := &model.GetErrorParams{ + Timestamp: timestamp, + GroupID: groupID, + ErrorID: errorID, } return params, nil diff --git a/pkg/query-service/app/server.go b/pkg/query-service/app/server.go index ea9a182786..b735f1b657 100644 --- a/pkg/query-service/app/server.go +++ b/pkg/query-service/app/server.go @@ -11,6 +11,7 @@ import ( "github.com/gorilla/handlers" "github.com/gorilla/mux" + "github.com/jmoiron/sqlx" "github.com/rs/cors" "github.com/soheilhy/cmux" @@ -19,15 +20,22 @@ import ( "go.signoz.io/query-service/constants" "go.signoz.io/query-service/dao" "go.signoz.io/query-service/healthcheck" + am "go.signoz.io/query-service/integrations/alertManager" "go.signoz.io/query-service/interfaces" + pqle "go.signoz.io/query-service/pqlEngine" + "go.signoz.io/query-service/rules" "go.signoz.io/query-service/telemetry" "go.signoz.io/query-service/utils" "go.uber.org/zap" ) type ServerOptions struct { + PromConfigPath string HTTPHostPort string PrivateHostPort string + // alert specific params + DisableRules bool + RuleRepoURL string } // Server runs HTTP, Mux and a grpc server @@ -35,6 +43,9 @@ type Server struct { // logger *zap.Logger // tracer opentracing.Tracer // TODO make part of flags.Service serverOptions *ServerOptions + conn net.Listener + ruleManager *rules.Manager + separatePorts bool // public http router httpConn net.Listener @@ -58,6 +69,7 @@ func NewServer(serverOptions *ServerOptions) (*Server, error) { if err := dao.InitDao("sqlite", constants.RELATIONAL_DATASOURCE_PATH); err != nil { return nil, err } + localDB, err := dashboards.InitDB(constants.RELATIONAL_DATASOURCE_PATH) if err != nil { @@ -70,16 +82,20 @@ func NewServer(serverOptions *ServerOptions) (*Server, error) { storage := os.Getenv("STORAGE") if storage == "clickhouse" { zap.S().Info("Using ClickHouse as datastore ...") - clickhouseReader := clickhouseReader.NewReader(localDB) + clickhouseReader := clickhouseReader.NewReader(localDB, serverOptions.PromConfigPath) go clickhouseReader.Start() reader = clickhouseReader } else { return nil, fmt.Errorf("Storage type: %s is not supported in query service", storage) } - telemetry.GetInstance().SetReader(reader) + rm, err := makeRulesManager(serverOptions.PromConfigPath, constants.GetAlertManagerApiPrefix(), serverOptions.RuleRepoURL, localDB, reader, serverOptions.DisableRules) + if err != nil { + return nil, err + } - apiHandler, err := NewAPIHandler(&reader, dao.DB()) + telemetry.GetInstance().SetReader(reader) + apiHandler, err := NewAPIHandler(&reader, dao.DB(), rm) if err != nil { return nil, err } @@ -87,6 +103,7 @@ func NewServer(serverOptions *ServerOptions) (*Server, error) { s := &Server{ // logger: logger, // tracer: tracer, + ruleManager: rm, serverOptions: serverOptions, unavailableChannel: make(chan healthcheck.Status), } @@ -268,6 +285,13 @@ func (s *Server) initListeners() 
error { // Start listening on http and private http port concurrently func (s *Server) Start() error { + // initiate rule manager first + if !s.serverOptions.DisableRules { + s.ruleManager.Start() + } else { + zap.S().Info("msg: Rules disabled as rules.disable is set to TRUE") + } + err := s.initListeners() if err != nil { return err @@ -321,3 +345,49 @@ func (s *Server) Start() error { return nil } + +func makeRulesManager( + promConfigPath, + alertManagerURL string, + ruleRepoURL string, + db *sqlx.DB, + ch interfaces.Reader, + disableRules bool) (*rules.Manager, error) { + + // create engine + pqle, err := pqle.FromConfigPath(promConfigPath) + if err != nil { + return nil, fmt.Errorf("failed to create pql engine : %v", err) + } + + // notifier opts + notifierOpts := am.NotifierOptions{ + QueueCapacity: 10000, + Timeout: 1 * time.Second, + AlertManagerURLs: []string{alertManagerURL}, + } + + // create manager opts + managerOpts := &rules.ManagerOptions{ + NotifierOpts: notifierOpts, + Queriers: &rules.Queriers{ + PqlEngine: pqle, + Ch: ch.GetConn(), + }, + RepoURL: ruleRepoURL, + DBConn: db, + Context: context.Background(), + Logger: nil, + DisableRules: disableRules, + } + + // create Manager + manager, err := rules.NewManager(managerOpts) + if err != nil { + return nil, fmt.Errorf("rule manager error: %v", err) + } + + zap.S().Info("rules manager is ready") + + return manager, nil +} diff --git a/pkg/query-service/constants/constants.go b/pkg/query-service/constants/constants.go index 3de0f6ae7a..8f590d81ba 100644 --- a/pkg/query-service/constants/constants.go +++ b/pkg/query-service/constants/constants.go @@ -57,12 +57,18 @@ const ( MsgSystem = "msgSystem" MsgOperation = "msgOperation" Timestamp = "timestamp" + RPCMethod = "rpcMethod" + ResponseStatusCode = "responseStatusCode" Descending = "descending" Ascending = "ascending" ContextTimeout = 60 // seconds StatusPending = "pending" StatusFailed = "failed" StatusSuccess = "success" + ExceptionType = "exceptionType" + ExceptionCount = "exceptionCount" + LastSeen = "lastSeen" + FirstSeen = "firstSeen" Attributes = "attributes" Resources = "resources" Static = "static" @@ -75,6 +81,12 @@ const ( SIGNOZ_TIMESERIES_TABLENAME = "time_series_v2" ) +// alert related constants +const ( + // AlertHelpPage is used in case default alert repo url is not set + AlertHelpPage = "https://signoz.io/docs/userguide/alerts-management/#generator-url" +) + func GetOrDefaultEnv(key string, fallback string) string { v := os.Getenv(key) if len(v) == 0 { diff --git a/pkg/query-service/integrations/alertManager/manager.go b/pkg/query-service/integrations/alertManager/manager.go index 47dc96f366..21b58174f9 100644 --- a/pkg/query-service/integrations/alertManager/manager.go +++ b/pkg/query-service/integrations/alertManager/manager.go @@ -5,35 +5,44 @@ import ( "bytes" "encoding/json" "fmt" - "net/http" - "go.signoz.io/query-service/constants" "go.signoz.io/query-service/model" "go.uber.org/zap" + "net/http" + neturl "net/url" ) const contentType = "application/json" type Manager interface { + URL() *neturl.URL + URLPath(path string) *neturl.URL AddRoute(receiver *Receiver) *model.ApiError EditRoute(receiver *Receiver) *model.ApiError DeleteRoute(name string) *model.ApiError TestReceiver(receiver *Receiver) *model.ApiError } -func New(url string) Manager { +func New(url string) (Manager, error) { if url == "" { url = constants.GetAlertManagerApiPrefix() } - return &manager{ - url: url, + urlParsed, err := neturl.Parse(url) + if err != nil { + return nil, err } + + 
return &manager{
+		url:       url,
+		parsedURL: urlParsed,
+	}, nil
 }
 
 type manager struct {
-	url string
+	url       string
+	parsedURL *neturl.URL
 }
 
 func prepareAmChannelApiURL() string {
@@ -52,6 +61,19 @@ func prepareTestApiURL() string {
 	return fmt.Sprintf("%s%s", basePath, "v1/testReceiver")
 }
 
+func (m *manager) URL() *neturl.URL {
+	return m.parsedURL
+}
+
+func (m *manager) URLPath(path string) *neturl.URL {
+	upath, err := neturl.Parse(path)
+	if err != nil {
+		return nil
+	}
+
+	return m.parsedURL.ResolveReference(upath)
+}
+
 func (m *manager) AddRoute(receiver *Receiver) *model.ApiError {
 
 	receiverString, _ := json.Marshal(receiver)
diff --git a/pkg/query-service/integrations/alertManager/model.go b/pkg/query-service/integrations/alertManager/model.go
index 705b0492fd..bb709e430f 100644
--- a/pkg/query-service/integrations/alertManager/model.go
+++ b/pkg/query-service/integrations/alertManager/model.go
@@ -1,5 +1,11 @@
 package alertManager
 
+import (
+	"fmt"
+	"go.signoz.io/query-service/utils/labels"
+	"time"
+)
+
 // Receiver configuration provides configuration on how to contact a receiver.
 type Receiver struct {
 	// A unique identifier for this receiver.
@@ -19,4 +25,51 @@ type Receiver struct {
 type ReceiverResponse struct {
 	Status string   `json:"status"`
 	Data   Receiver `json:"data"`
-}
\ No newline at end of file
+}
+
+// Alert is a generic representation of an alert in the Prometheus eco-system.
+type Alert struct {
+	// Label value pairs for purpose of aggregation, matching, and disposition
+	// dispatching. This must minimally include an "alertname" label.
+	Labels labels.BaseLabels `json:"labels"`
+
+	// Extra key/value information which does not define alert identity.
+	Annotations labels.BaseLabels `json:"annotations"`
+
+	// The known time range for this alert. Both ends are optional.
+	StartsAt     time.Time `json:"startsAt,omitempty"`
+	EndsAt       time.Time `json:"endsAt,omitempty"`
+	GeneratorURL string    `json:"generatorURL,omitempty"`
+}
+
+// Name returns the name of the alert. It is equivalent to the "alertname" label.
+func (a *Alert) Name() string {
+	return a.Labels.Get(labels.AlertNameLabel)
+}
+
+// Hash returns a hash over the alert. It is equivalent to the alert labels hash.
+func (a *Alert) Hash() uint64 {
+	return a.Labels.Hash()
+}
+
+func (a *Alert) String() string {
+	s := fmt.Sprintf("%s[%s]", a.Name(), fmt.Sprintf("%016x", a.Hash())[:7])
+	if a.Resolved() {
+		return s + "[resolved]"
+	}
+	return s + "[active]"
+}
+
+// Resolved returns true iff the activity interval ended in the past.
+func (a *Alert) Resolved() bool {
+	return a.ResolvedAt(time.Now())
+}
+
+// ResolvedAt returns true iff the activity interval ended before
+// the given timestamp.
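+// A zero EndsAt means the alert is still open, so it is never
+// considered resolved.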
+func (a *Alert) ResolvedAt(ts time.Time) bool { + if a.EndsAt.IsZero() { + return false + } + return !a.EndsAt.After(ts) +} diff --git a/pkg/query-service/integrations/alertManager/notifier.go b/pkg/query-service/integrations/alertManager/notifier.go new file mode 100644 index 0000000000..148d489ed0 --- /dev/null +++ b/pkg/query-service/integrations/alertManager/notifier.go @@ -0,0 +1,310 @@ +package alertManager + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "sync/atomic" + + "net/http" + "net/url" + "sync" + "time" + + old_ctx "golang.org/x/net/context" + + "github.com/go-kit/kit/log" + "github.com/go-kit/log/level" + + "go.uber.org/zap" + "golang.org/x/net/context/ctxhttp" +) + +const ( + alertPushEndpoint = "v1/alerts" + contentTypeJSON = "application/json" +) + +// Notifier is responsible for dispatching alert notifications to an +// alert manager service. +type Notifier struct { + queue []*Alert + opts *NotifierOptions + + more chan struct{} + mtx sync.RWMutex + ctx context.Context + cancel func() + + alertmanagers *alertmanagerSet + logger log.Logger +} + +// NotifierOptions are the configurable parameters of a Handler. +type NotifierOptions struct { + QueueCapacity int + // Used for sending HTTP requests to the Alertmanager. + Do func(ctx old_ctx.Context, client *http.Client, req *http.Request) (*http.Response, error) + // List of alert manager urls + AlertManagerURLs []string + // timeout limit on requests + Timeout time.Duration +} + +func (opts *NotifierOptions) String() string { + var urls string + for _, u := range opts.AlertManagerURLs { + urls = fmt.Sprintf("%s %s", urls, u) + } + return urls +} + +// todo(amol): add metrics + +func NewNotifier(o *NotifierOptions, logger log.Logger) (*Notifier, error) { + ctx, cancel := context.WithCancel(context.Background()) + if o.Do == nil { + o.Do = ctxhttp.Do + } + if logger == nil { + logger = log.NewNopLogger() + } + + n := &Notifier{ + queue: make([]*Alert, 0, o.QueueCapacity), + ctx: ctx, + cancel: cancel, + more: make(chan struct{}, 1), + opts: o, + logger: logger, + } + timeout := o.Timeout + + if int64(timeout) == 0 { + timeout = time.Duration(30 * time.Second) + } + + amset, err := newAlertmanagerSet(o.AlertManagerURLs, timeout, logger) + if err != nil { + zap.S().Errorf("failed to parse alert manager urls") + return n, err + } + n.alertmanagers = amset + zap.S().Info("Starting notifier with alert manager:", o.AlertManagerURLs) + return n, nil +} + +const maxBatchSize = 64 + +func (n *Notifier) queueLen() int { + n.mtx.RLock() + defer n.mtx.RUnlock() + + return len(n.queue) +} + +func (n *Notifier) nextBatch() []*Alert { + n.mtx.Lock() + defer n.mtx.Unlock() + + var alerts []*Alert + + if len(n.queue) > maxBatchSize { + alerts = append(make([]*Alert, 0, maxBatchSize), n.queue[:maxBatchSize]...) + n.queue = n.queue[maxBatchSize:] + } else { + alerts = append(make([]*Alert, 0, len(n.queue)), n.queue...) + n.queue = n.queue[:0] + } + + return alerts +} + +// Run dispatches notifications continuously. +func (n *Notifier) Run() { + zap.S().Info("msg: Initiating alert notifier...") + for { + select { + case <-n.ctx.Done(): + return + case <-n.more: + } + alerts := n.nextBatch() + + if !n.sendAll(alerts...) { + zap.S().Warn("msg: dropped alerts", "\t count:", len(alerts)) + // n.metrics.dropped.Add(float64(len(alerts))) + } + // If the queue still has items left, kick off the next iteration. + if n.queueLen() > 0 { + n.setMore() + } + } +} + +// Send queues the given notification requests for processing. 
+// Panics if called on a handler that is not running. +func (n *Notifier) Send(alerts ...*Alert) { + n.mtx.Lock() + defer n.mtx.Unlock() + + // Queue capacity should be significantly larger than a single alert + // batch could be. + if d := len(alerts) - n.opts.QueueCapacity; d > 0 { + alerts = alerts[d:] + + level.Warn(n.logger).Log("msg", "Alert batch larger than queue capacity, dropping alerts", "num_dropped", d) + //n.metrics.dropped.Add(float64(d)) + } + + // If the queue is full, remove the oldest alerts in favor + // of newer ones. + if d := (len(n.queue) + len(alerts)) - n.opts.QueueCapacity; d > 0 { + n.queue = n.queue[d:] + + level.Warn(n.logger).Log("msg", "Alert notification queue full, dropping alerts", "num_dropped", d) + //n.metrics.dropped.Add(float64(d)) + } + n.queue = append(n.queue, alerts...) + + // Notify sending goroutine that there are alerts to be processed. + n.setMore() +} + +// setMore signals that the alert queue has items. +func (n *Notifier) setMore() { + // If we cannot send on the channel, it means the signal already exists + // and has not been consumed yet. + select { + case n.more <- struct{}{}: + default: + } +} + +// Alertmanagers returns a slice of Alertmanager URLs. +func (n *Notifier) Alertmanagers() []*url.URL { + n.mtx.RLock() + amset := n.alertmanagers + n.mtx.RUnlock() + + var res []*url.URL + + amset.mtx.RLock() + for _, am := range amset.ams { + res = append(res, am.URLPath(alertPushEndpoint)) + } + amset.mtx.RUnlock() + + return res +} + +// sendAll sends the alerts to all configured Alertmanagers concurrently. +// It returns true if the alerts could be sent successfully to at least one Alertmanager. +func (n *Notifier) sendAll(alerts ...*Alert) bool { + + b, err := json.Marshal(alerts) + if err != nil { + zap.S().Errorf("msg", "Encoding alerts failed", "err", err) + return false + } + + n.mtx.RLock() + ams := n.alertmanagers + n.mtx.RUnlock() + + var ( + wg sync.WaitGroup + numSuccess uint64 + ) + + ams.mtx.RLock() + + for _, am := range ams.ams { + wg.Add(1) + + ctx, cancel := context.WithTimeout(n.ctx, time.Duration(ams.timeout)) + defer cancel() + + go func(ams *alertmanagerSet, am Manager) { + u := am.URLPath(alertPushEndpoint).String() + if err := n.sendOne(ctx, ams.client, u, b); err != nil { + zap.S().Errorf("alertmanager", u, "count", len(alerts), "msg", "Error calling alert API", "err", err) + } else { + atomic.AddUint64(&numSuccess, 1) + } + // n.metrics.latency.WithLabelValues(u).Observe(time.Since(begin).Seconds()) + // n.metrics.sent.WithLabelValues(u).Add(float64(len(alerts))) + + wg.Done() + }(ams, am) + } + ams.mtx.RUnlock() + + wg.Wait() + + return numSuccess > 0 +} + +func (n *Notifier) sendOne(ctx context.Context, c *http.Client, url string, b []byte) error { + req, err := http.NewRequest("POST", url, bytes.NewReader(b)) + if err != nil { + return err + } + req.Header.Set("Content-Type", contentTypeJSON) + resp, err := n.opts.Do(ctx, c, req) + if err != nil { + return err + } + defer resp.Body.Close() + + // Any HTTP status 2xx is OK. + if resp.StatusCode/100 != 2 { + return fmt.Errorf("bad response status %v", resp.Status) + } + return err +} + +// Stop shuts down the notification handler. +func (n *Notifier) Stop() { + level.Info(n.logger).Log("msg", "Stopping notification manager...") + n.cancel() +} + +// alertmanagerSet contains a set of Alertmanagers discovered via a group of service +// discovery definitions that have a common configuration on how alerts should be sent. 
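+// All managers in the set share a single HTTP client and timeout.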
+type alertmanagerSet struct { + urls []string + client *http.Client + timeout time.Duration + mtx sync.RWMutex + ams []Manager + + logger log.Logger +} + +func newAlertmanagerSet(urls []string, timeout time.Duration, logger log.Logger) (*alertmanagerSet, error) { + client := &http.Client{} + + s := &alertmanagerSet{ + client: client, + urls: urls, + logger: logger, + timeout: timeout, + } + + ams := []Manager{} + for _, u := range urls { + am, err := New(u) + if err != nil { + level.Error(s.logger).Log(fmt.Sprintf("invalid alert manager url %s: %s", u, err)) + } else { + ams = append(ams, am) + } + } + if len(ams) == 0 { + return s, fmt.Errorf("no alert managers") + } + s.ams = ams + return s, nil +} diff --git a/pkg/query-service/interfaces/interface.go b/pkg/query-service/interfaces/interface.go index c8d9b1cfcf..8519b09b04 100644 --- a/pkg/query-service/interfaces/interface.go +++ b/pkg/query-service/interfaces/interface.go @@ -3,6 +3,7 @@ package interfaces import ( "context" + "github.com/ClickHouse/clickhouse-go/v2" "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/util/stats" am "go.signoz.io/query-service/integrations/alertManager" @@ -16,12 +17,6 @@ type Reader interface { CreateChannel(receiver *am.Receiver) (*am.Receiver, *model.ApiError) EditChannel(receiver *am.Receiver, id string) (*am.Receiver, *model.ApiError) - GetRule(id string) (*model.RuleResponseItem, *model.ApiError) - ListRulesFromProm() (*model.AlertDiscovery, *model.ApiError) - CreateRule(alert string) *model.ApiError - EditRule(alert string, id string) *model.ApiError - DeleteRule(id string) *model.ApiError - GetInstantQueryMetricsResult(ctx context.Context, query *model.InstantQueryMetricsParams) (*promql.Result, *stats.QueryStats, *model.ApiError) GetQueryRangeResult(ctx context.Context, query *model.QueryRangeParams) (*promql.Result, *stats.QueryStats, *model.ApiError) GetServiceOverview(ctx context.Context, query *model.GetServiceOverviewParams) (*[]model.ServiceOverviewItem, *model.ApiError) @@ -41,9 +36,12 @@ type Reader interface { GetFilteredSpans(ctx context.Context, query *model.GetFilteredSpansParams) (*model.GetFilterSpansResponse, *model.ApiError) GetFilteredSpansAggregates(ctx context.Context, query *model.GetFilteredSpanAggregatesParams) (*model.GetFilteredSpansAggregatesResponse, *model.ApiError) - GetErrors(ctx context.Context, params *model.GetErrorsParams) (*[]model.Error, *model.ApiError) - GetErrorForId(ctx context.Context, params *model.GetErrorParams) (*model.ErrorWithSpan, *model.ApiError) - GetErrorForType(ctx context.Context, params *model.GetErrorParams) (*model.ErrorWithSpan, *model.ApiError) + ListErrors(ctx context.Context, params *model.ListErrorsParams) (*[]model.Error, *model.ApiError) + CountErrors(ctx context.Context, params *model.CountErrorsParams) (uint64, *model.ApiError) + GetErrorFromErrorID(ctx context.Context, params *model.GetErrorParams) (*model.ErrorWithSpan, *model.ApiError) + GetErrorFromGroupID(ctx context.Context, params *model.GetErrorParams) (*model.ErrorWithSpan, *model.ApiError) + GetNextPrevErrorIDs(ctx context.Context, params *model.GetErrorParams) (*model.NextPrevErrorIDs, *model.ApiError) + // Search Interfaces SearchTraces(ctx context.Context, traceID string) (*[]model.SearchSpansResult, error) @@ -65,4 +63,7 @@ type Reader interface { UpdateLogField(ctx context.Context, field *model.UpdateField) *model.ApiError GetLogs(ctx context.Context, params *model.LogsFilterParams) (*[]model.GetLogsResponse, *model.ApiError) TailLogs(ctx 
context.Context, client *model.LogsTailClient) *model.ApiError + + // Connection needed for rules, not ideal but required + GetConn() clickhouse.Conn } diff --git a/pkg/query-service/main.go b/pkg/query-service/main.go index b837560531..e23ff8785b 100644 --- a/pkg/query-service/main.go +++ b/pkg/query-service/main.go @@ -2,6 +2,7 @@ package main import ( "context" + "flag" "os" "os/signal" "syscall" @@ -25,6 +26,18 @@ func initZapLog() *zap.Logger { } func main() { + var promConfigPath string + + // disables rule execution but allows change to the rule definition + var disableRules bool + + // the url used to build link in the alert messages in slack and other systems + var ruleRepoURL string + + flag.StringVar(&promConfigPath, "config", "./config/prometheus.yml", "(prometheus config to read metrics)") + flag.BoolVar(&disableRules, "rules.disable", false, "(disable rule evaluation)") + flag.StringVar(&ruleRepoURL, "rules.repo-url", constants.AlertHelpPage, "(host address used to build rule link in alert messages)") + flag.Parse() loggerMgr := initZapLog() zap.ReplaceGlobals(loggerMgr) @@ -35,7 +48,10 @@ func main() { serverOptions := &app.ServerOptions{ HTTPHostPort: constants.HTTPHostPort, + PromConfigPath: promConfigPath, PrivateHostPort: constants.PrivateHostPort, + DisableRules: disableRules, + RuleRepoURL: ruleRepoURL, } // Read the jwt secret key diff --git a/pkg/query-service/model/queryParams.go b/pkg/query-service/model/queryParams.go index 1df7813679..d63a07eb71 100644 --- a/pkg/query-service/model/queryParams.go +++ b/pkg/query-service/model/queryParams.go @@ -181,94 +181,102 @@ type TagQuery struct { } type GetFilteredSpansParams struct { - ServiceName []string `json:"serviceName"` - Operation []string `json:"operation"` - Kind string `json:"kind"` - Status []string `json:"status"` - HttpRoute []string `json:"httpRoute"` - HttpCode []string `json:"httpCode"` - HttpUrl []string `json:"httpUrl"` - HttpHost []string `json:"httpHost"` - HttpMethod []string `json:"httpMethod"` - Component []string `json:"component"` - StartStr string `json:"start"` - EndStr string `json:"end"` - MinDuration string `json:"minDuration"` - MaxDuration string `json:"maxDuration"` - Limit int64 `json:"limit"` - OrderParam string `json:"orderParam"` - Order string `json:"order"` - Offset int64 `json:"offset"` - Tags []TagQuery `json:"tags"` - Exclude []string `json:"exclude"` - Start *time.Time - End *time.Time + ServiceName []string `json:"serviceName"` + Operation []string `json:"operation"` + Kind string `json:"kind"` + Status []string `json:"status"` + HttpRoute []string `json:"httpRoute"` + HttpCode []string `json:"httpCode"` + HttpUrl []string `json:"httpUrl"` + HttpHost []string `json:"httpHost"` + HttpMethod []string `json:"httpMethod"` + Component []string `json:"component"` + RPCMethod []string `json:"rpcMethod"` + ResponseStatusCode []string `json:"responseStatusCode"` + StartStr string `json:"start"` + EndStr string `json:"end"` + MinDuration string `json:"minDuration"` + MaxDuration string `json:"maxDuration"` + Limit int64 `json:"limit"` + OrderParam string `json:"orderParam"` + Order string `json:"order"` + Offset int64 `json:"offset"` + Tags []TagQuery `json:"tags"` + Exclude []string `json:"exclude"` + Start *time.Time + End *time.Time } type GetFilteredSpanAggregatesParams struct { - ServiceName []string `json:"serviceName"` - Operation []string `json:"operation"` - Kind string `json:"kind"` - Status []string `json:"status"` - HttpRoute []string `json:"httpRoute"` - HttpCode []string 
`json:"httpCode"` - HttpUrl []string `json:"httpUrl"` - HttpHost []string `json:"httpHost"` - HttpMethod []string `json:"httpMethod"` - Component []string `json:"component"` - MinDuration string `json:"minDuration"` - MaxDuration string `json:"maxDuration"` - Tags []TagQuery `json:"tags"` - StartStr string `json:"start"` - EndStr string `json:"end"` - StepSeconds int `json:"step"` - Dimension string `json:"dimension"` - AggregationOption string `json:"aggregationOption"` - GroupBy string `json:"groupBy"` - Function string `json:"function"` - Exclude []string `json:"exclude"` - Start *time.Time - End *time.Time + ServiceName []string `json:"serviceName"` + Operation []string `json:"operation"` + Kind string `json:"kind"` + Status []string `json:"status"` + HttpRoute []string `json:"httpRoute"` + HttpCode []string `json:"httpCode"` + HttpUrl []string `json:"httpUrl"` + HttpHost []string `json:"httpHost"` + HttpMethod []string `json:"httpMethod"` + Component []string `json:"component"` + RPCMethod []string `json:"rpcMethod"` + ResponseStatusCode []string `json:"responseStatusCode"` + MinDuration string `json:"minDuration"` + MaxDuration string `json:"maxDuration"` + Tags []TagQuery `json:"tags"` + StartStr string `json:"start"` + EndStr string `json:"end"` + StepSeconds int `json:"step"` + Dimension string `json:"dimension"` + AggregationOption string `json:"aggregationOption"` + GroupBy string `json:"groupBy"` + Function string `json:"function"` + Exclude []string `json:"exclude"` + Start *time.Time + End *time.Time } type SpanFilterParams struct { - Status []string `json:"status"` - ServiceName []string `json:"serviceName"` - HttpRoute []string `json:"httpRoute"` - HttpCode []string `json:"httpCode"` - HttpUrl []string `json:"httpUrl"` - HttpHost []string `json:"httpHost"` - HttpMethod []string `json:"httpMethod"` - Component []string `json:"component"` - Operation []string `json:"operation"` - GetFilters []string `json:"getFilters"` - Exclude []string `json:"exclude"` - MinDuration string `json:"minDuration"` - MaxDuration string `json:"maxDuration"` - StartStr string `json:"start"` - EndStr string `json:"end"` - Start *time.Time - End *time.Time + Status []string `json:"status"` + ServiceName []string `json:"serviceName"` + HttpRoute []string `json:"httpRoute"` + HttpCode []string `json:"httpCode"` + HttpUrl []string `json:"httpUrl"` + HttpHost []string `json:"httpHost"` + HttpMethod []string `json:"httpMethod"` + Component []string `json:"component"` + Operation []string `json:"operation"` + RPCMethod []string `json:"rpcMethod"` + ResponseStatusCode []string `json:"responseStatusCode"` + GetFilters []string `json:"getFilters"` + Exclude []string `json:"exclude"` + MinDuration string `json:"minDuration"` + MaxDuration string `json:"maxDuration"` + StartStr string `json:"start"` + EndStr string `json:"end"` + Start *time.Time + End *time.Time } type TagFilterParams struct { - Status []string `json:"status"` - ServiceName []string `json:"serviceName"` - HttpRoute []string `json:"httpRoute"` - HttpCode []string `json:"httpCode"` - HttpUrl []string `json:"httpUrl"` - HttpHost []string `json:"httpHost"` - HttpMethod []string `json:"httpMethod"` - Component []string `json:"component"` - Operation []string `json:"operation"` - Exclude []string `json:"exclude"` - MinDuration string `json:"minDuration"` - MaxDuration string `json:"maxDuration"` - StartStr string `json:"start"` - EndStr string `json:"end"` - TagKey string `json:"tagKey"` - Start *time.Time - End *time.Time + Status []string 
`json:"status"` + ServiceName []string `json:"serviceName"` + HttpRoute []string `json:"httpRoute"` + HttpCode []string `json:"httpCode"` + HttpUrl []string `json:"httpUrl"` + HttpHost []string `json:"httpHost"` + HttpMethod []string `json:"httpMethod"` + Component []string `json:"component"` + Operation []string `json:"operation"` + RPCMethod []string `json:"rpcMethod"` + ResponseStatusCode []string `json:"responseStatusCode"` + Exclude []string `json:"exclude"` + MinDuration string `json:"minDuration"` + MaxDuration string `json:"maxDuration"` + StartStr string `json:"start"` + EndStr string `json:"end"` + TagKey string `json:"tagKey"` + Start *time.Time + End *time.Time } type TTLParams struct { @@ -282,15 +290,24 @@ type GetTTLParams struct { Type string } -type GetErrorsParams struct { +type ListErrorsParams struct { + Start *time.Time + End *time.Time + Limit int64 + OrderParam string + Order string + Offset int64 +} + +type CountErrorsParams struct { Start *time.Time End *time.Time } type GetErrorParams struct { - ErrorType string - ErrorID string - ServiceName string + GroupID string + ErrorID string + Timestamp *time.Time } type FilterItem struct { diff --git a/pkg/query-service/model/response.go b/pkg/query-service/model/response.go index 7f6020c114..8efb4f1069 100644 --- a/pkg/query-service/model/response.go +++ b/pkg/query-service/model/response.go @@ -119,18 +119,20 @@ type SearchSpansResult struct { } type GetFilterSpansResponseItem struct { - Timestamp time.Time `ch:"timestamp" json:"timestamp"` - SpanID string `ch:"spanID" json:"spanID"` - TraceID string `ch:"traceID" json:"traceID"` - ServiceName string `ch:"serviceName" json:"serviceName"` - Operation string `ch:"name" json:"operation"` - DurationNano uint64 `ch:"durationNano" json:"durationNano"` - HttpCode string `ch:"httpCode"` - HttpMethod string `ch:"httpMethod"` - GRPCode string `ch:"gRPCCode"` - GRPMethod string `ch:"gRPCMethod"` - StatusCode string `json:"statusCode"` - Method string `json:"method"` + Timestamp time.Time `ch:"timestamp" json:"timestamp"` + SpanID string `ch:"spanID" json:"spanID"` + TraceID string `ch:"traceID" json:"traceID"` + ServiceName string `ch:"serviceName" json:"serviceName"` + Operation string `ch:"name" json:"operation"` + DurationNano uint64 `ch:"durationNano" json:"durationNano"` + HttpCode string `ch:"httpCode"` + HttpMethod string `ch:"httpMethod"` + GRPCode string `ch:"gRPCCode"` + GRPMethod string `ch:"gRPCMethod"` + StatusCode string `json:"statusCode"` + Method string `json:"method"` + ResponseStatusCode string `ch:"responseStatusCode"` + RPCMethod string `ch:"rpcMethod"` } type GetFilterSpansResponse struct { @@ -304,6 +306,16 @@ type DBResponseHttpMethod struct { Count uint64 `ch:"count"` } +type DBResponseStatusCodeMethod struct { + ResponseStatusCode string `ch:"responseStatusCode"` + Count uint64 `ch:"count"` +} + +type DBResponseRPCMethod struct { + RPCMethod string `ch:"rpcMethod"` + Count uint64 `ch:"count"` +} + type DBResponseHttpHost struct { HttpHost string `ch:"httpHost"` Count uint64 `ch:"count"` @@ -324,16 +336,18 @@ type DBResponseTotal struct { } type SpanFiltersResponse struct { - ServiceName map[string]uint64 `json:"serviceName"` - Status map[string]uint64 `json:"status"` - Duration map[string]uint64 `json:"duration"` - Operation map[string]uint64 `json:"operation"` - HttpCode map[string]uint64 `json:"httpCode"` - HttpUrl map[string]uint64 `json:"httpUrl"` - HttpMethod map[string]uint64 `json:"httpMethod"` - HttpRoute map[string]uint64 `json:"httpRoute"` - 
HttpHost map[string]uint64 `json:"httpHost"` - Component map[string]uint64 `json:"component"` + ServiceName map[string]uint64 `json:"serviceName"` + Status map[string]uint64 `json:"status"` + Duration map[string]uint64 `json:"duration"` + Operation map[string]uint64 `json:"operation"` + HttpCode map[string]uint64 `json:"httpCode"` + ResponseStatusCode map[string]uint64 `json:"responseStatusCode"` + RPCMethod map[string]uint64 `json:"rpcMethod"` + HttpUrl map[string]uint64 `json:"httpUrl"` + HttpMethod map[string]uint64 `json:"httpMethod"` + HttpRoute map[string]uint64 `json:"httpRoute"` + HttpHost map[string]uint64 `json:"httpHost"` + Component map[string]uint64 `json:"component"` } type Error struct { ExceptionType string `json:"exceptionType" ch:"exceptionType"` @@ -342,20 +356,36 @@ type Error struct { LastSeen time.Time `json:"lastSeen" ch:"lastSeen"` FirstSeen time.Time `json:"firstSeen" ch:"firstSeen"` ServiceName string `json:"serviceName" ch:"serviceName"` + GroupID string `json:"groupID" ch:"groupID"` } type ErrorWithSpan struct { ErrorID string `json:"errorId" ch:"errorID"` ExceptionType string `json:"exceptionType" ch:"exceptionType"` ExceptionStacktrace string `json:"exceptionStacktrace" ch:"exceptionStacktrace"` - ExceptionEscaped string `json:"exceptionEscaped" ch:"exceptionEscaped"` + ExceptionEscaped bool `json:"exceptionEscaped" ch:"exceptionEscaped"` ExceptionMsg string `json:"exceptionMessage" ch:"exceptionMessage"` Timestamp time.Time `json:"timestamp" ch:"timestamp"` SpanID string `json:"spanID" ch:"spanID"` TraceID string `json:"traceID" ch:"traceID"` ServiceName string `json:"serviceName" ch:"serviceName"` - NewerErrorID string `json:"newerErrorId" ch:"newerErrorId"` - OlderErrorID string `json:"olderErrorId" ch:"olderErrorId"` + GroupID string `json:"groupID" ch:"groupID"` +} + +type NextPrevErrorIDsDBResponse struct { + NextErrorID string `ch:"nextErrorID"` + NextTimestamp time.Time `ch:"nextTimestamp"` + PrevErrorID string `ch:"prevErrorID"` + PrevTimestamp time.Time `ch:"prevTimestamp"` + Timestamp time.Time `ch:"timestamp"` +} + +type NextPrevErrorIDs struct { + NextErrorID string `json:"nextErrorID"` + NextTimestamp time.Time `json:"nextTimestamp"` + PrevErrorID string `json:"prevErrorID"` + PrevTimestamp time.Time `json:"prevTimestamp"` + GroupID string `json:"groupID"` } type Series struct { diff --git a/pkg/query-service/pqlEngine/engine.go b/pkg/query-service/pqlEngine/engine.go new file mode 100644 index 0000000000..e9a45ad542 --- /dev/null +++ b/pkg/query-service/pqlEngine/engine.go @@ -0,0 +1,85 @@ +package promql + +import ( + "context" + "fmt" + "github.com/go-kit/log" + pmodel "github.com/prometheus/common/model" + plog "github.com/prometheus/common/promlog" + pconfig "github.com/prometheus/prometheus/config" + plabels "github.com/prometheus/prometheus/pkg/labels" + pql "github.com/prometheus/prometheus/promql" + pstorage "github.com/prometheus/prometheus/storage" + premote "github.com/prometheus/prometheus/storage/remote" + "time" +) + +type PqlEngine struct { + engine *pql.Engine + fanoutStorage pstorage.Storage +} + +func FromConfigPath(promConfigPath string) (*PqlEngine, error) { + // load storage path + c, err := pconfig.LoadFile(promConfigPath) + if err != nil { + return nil, fmt.Errorf("couldn't load configuration (--config.file=%q): %v", promConfigPath, err) + } + + return NewPqlEngine(c) +} + +func NewPqlEngine(config *pconfig.Config) (*PqlEngine, error) { + + logLevel := plog.AllowedLevel{} + logLevel.Set("debug") + logger := 
plog.New(logLevel) + + opts := pql.EngineOpts{ + Logger: log.With(logger, "component", "promql evaluator"), + Reg: nil, + MaxConcurrent: 20, + MaxSamples: 50000000, + Timeout: time.Duration(2 * time.Minute), + } + + e := pql.NewEngine(opts) + startTime := func() (int64, error) { + return int64(pmodel.Latest), nil + } + + remoteStorage := premote.NewStorage(log.With(logger, "component", "remote"), startTime, time.Duration(1*time.Minute)) + fanoutStorage := pstorage.NewFanout(logger, remoteStorage) + + remoteStorage.ApplyConfig(config) + + return &PqlEngine{ + engine: e, + fanoutStorage: fanoutStorage, + }, nil +} + +func (p *PqlEngine) RunAlertQuery(ctx context.Context, qs string, t time.Time) (pql.Vector, error) { + q, err := p.engine.NewInstantQuery(p.fanoutStorage, qs, t) + if err != nil { + return nil, err + } + + res := q.Exec(ctx) + + if res.Err != nil { + return nil, res.Err + } + + switch v := res.Value.(type) { + case pql.Vector: + return v, nil + case pql.Scalar: + return pql.Vector{pql.Sample{ + Point: pql.Point(v), + Metric: plabels.Labels{}, + }}, nil + default: + return nil, fmt.Errorf("rule result is not a vector or scalar") + } +} diff --git a/pkg/query-service/rules/alerting.go b/pkg/query-service/rules/alerting.go new file mode 100644 index 0000000000..a4768b4036 --- /dev/null +++ b/pkg/query-service/rules/alerting.go @@ -0,0 +1,200 @@ +package rules + +import ( + "encoding/json" + "github.com/pkg/errors" + "go.signoz.io/query-service/model" + "go.signoz.io/query-service/utils/labels" + "time" +) + +// how long before re-sending the alert +const resolvedRetention = 15 * time.Minute + +const ( + // AlertMetricName is the metric name for synthetic alert timeseries. + alertMetricName = "ALERTS" + + // AlertForStateMetricName is the metric name for 'for' state of alert. + alertForStateMetricName = "ALERTS_FOR_STATE" +) + +type RuleType string + +const ( + RuleTypeThreshold = "threshold_rule" + RuleTypeProm = "promql_rule" +) + +type RuleHealth string + +const ( + HealthUnknown RuleHealth = "unknown" + HealthGood RuleHealth = "ok" + HealthBad RuleHealth = "err" +) + +// AlertState denotes the state of an active alert. 
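+// The zero value is StateInactive; StatePending and StateFiring
+// escalate from it.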
+type AlertState int + +const ( + StateInactive AlertState = iota + StatePending + StateFiring +) + +func (s AlertState) String() string { + switch s { + case StateInactive: + return "inactive" + case StatePending: + return "pending" + case StateFiring: + return "firing" + } + panic(errors.Errorf("unknown alert state: %d", s)) +} + +type Alert struct { + State AlertState + + Labels labels.BaseLabels + Annotations labels.BaseLabels + + GeneratorURL string + + Value float64 + ActiveAt time.Time + FiredAt time.Time + ResolvedAt time.Time + LastSentAt time.Time + ValidUntil time.Time +} + +// todo(amol): need to review this with ankit +func (a *Alert) needsSending(ts time.Time, resendDelay time.Duration) bool { + if a.State == StatePending { + return false + } + + // if an alert has been resolved since the last send, resend it + if a.ResolvedAt.After(a.LastSentAt) { + return true + } + + return a.LastSentAt.Add(resendDelay).Before(ts) +} + +type NamedAlert struct { + Name string + *Alert +} + +type CompareOp string + +const ( + CompareOpNone CompareOp = "0" + ValueIsAbove CompareOp = "1" + ValueIsBelow CompareOp = "2" + ValueIsEq CompareOp = "3" + ValueIsNotEq CompareOp = "4" +) + +func ResolveCompareOp(cop CompareOp) string { + switch cop { + case ValueIsAbove: + return ">" + case ValueIsBelow: + return "<" + case ValueIsEq: + return "==" + case ValueIsNotEq: + return "!=" + } + return "" +} + +type MatchType string + +const ( + MatchTypeNone MatchType = "0" + AtleastOnce MatchType = "1" + AllTheTimes MatchType = "2" + OnAverage MatchType = "3" + InTotal MatchType = "4" +) + +type RuleCondition struct { + CompositeMetricQuery *model.CompositeMetricQuery `json:"compositeMetricQuery,omitempty" yaml:"compositeMetricQuery,omitempty"` + CompareOp CompareOp `yaml:"op,omitempty" json:"op,omitempty"` + Target *float64 `yaml:"target,omitempty" json:"target,omitempty"` + MatchType `json:"matchType,omitempty"` +} + +func (rc *RuleCondition) IsValid() bool { + + if rc.CompositeMetricQuery == nil { + return false + } + + if rc.QueryType() == model.QUERY_BUILDER { + if rc.Target == nil { + return false + } + if rc.CompareOp == "" { + return false + } + } + if rc.QueryType() == model.PROM { + + if len(rc.CompositeMetricQuery.PromQueries) == 0 { + return false + } + } + return true +} + +// QueryType is a short hand method to get query type +func (rc *RuleCondition) QueryType() model.QueryType { + if rc.CompositeMetricQuery != nil { + return rc.CompositeMetricQuery.QueryType + } + return 0 +} + +// String is useful in printing rule condition in logs +func (rc *RuleCondition) String() string { + if rc == nil { + return "" + } + data, _ := json.Marshal(*rc) + return string(data) +} + +type Duration time.Duration + +func (d Duration) MarshalJSON() ([]byte, error) { + return json.Marshal(time.Duration(d).String()) +} + +func (d *Duration) UnmarshalJSON(b []byte) error { + var v interface{} + if err := json.Unmarshal(b, &v); err != nil { + return err + } + switch value := v.(type) { + case float64: + *d = Duration(time.Duration(value)) + return nil + case string: + tmp, err := time.ParseDuration(value) + if err != nil { + return err + } + *d = Duration(tmp) + + return nil + default: + return errors.New("invalid duration") + } +} diff --git a/pkg/query-service/rules/apiParams.go b/pkg/query-service/rules/apiParams.go new file mode 100644 index 0000000000..6f3b466d11 --- /dev/null +++ b/pkg/query-service/rules/apiParams.go @@ -0,0 +1,230 @@ +package rules + +import ( + "context" + "encoding/json" + "fmt" + 
"github.com/pkg/errors" + "go.signoz.io/query-service/model" + "go.uber.org/zap" + "time" + "unicode/utf8" + + "go.signoz.io/query-service/utils/times" + "go.signoz.io/query-service/utils/timestamp" + yaml "gopkg.in/yaml.v2" +) + +// this file contains api request and responses to be +// served over http + +// PostableRule is used to create alerting rule from HTTP api +type PostableRule struct { + Alert string `yaml:"alert,omitempty" json:"alert,omitempty"` + Description string `yaml:"description,omitempty" json:"description,omitempty"` + RuleType RuleType `yaml:"ruleType,omitempty" json:"ruleType,omitempty"` + EvalWindow Duration `yaml:"evalWindow,omitempty" json:"evalWindow,omitempty"` + Frequency Duration `yaml:"frequency,omitempty" json:"frequency,omitempty"` + + RuleCondition *RuleCondition `yaml:"condition,omitempty" json:"condition,omitempty"` + Labels map[string]string `yaml:"labels,omitempty" json:"labels,omitempty"` + Annotations map[string]string `yaml:"annotations,omitempty" json:"annotations,omitempty"` + + // Source captures the source url where rule has been created + Source string `json:"source,omitempty"` + + // legacy + Expr string `yaml:"expr,omitempty" json:"expr,omitempty"` + OldYaml string `json:"yaml,omitempty"` +} + +func ParsePostableRule(content []byte) (*PostableRule, []error) { + return parsePostableRule(content, "json") +} + +func parsePostableRule(content []byte, kind string) (*PostableRule, []error) { + rule := PostableRule{} + + var err error + if kind == "json" { + if err = json.Unmarshal(content, &rule); err != nil { + zap.S().Debugf("postable rule content", string(content), "\t kind:", kind) + return nil, []error{fmt.Errorf("failed to load json")} + } + } else if kind == "yaml" { + if err = yaml.Unmarshal(content, &rule); err != nil { + zap.S().Debugf("postable rule content", string(content), "\t kind:", kind) + return nil, []error{fmt.Errorf("failed to load yaml")} + } + } else { + return nil, []error{fmt.Errorf("invalid data type")} + } + zap.S().Debugf("postable rule(parsed):", rule) + + if rule.RuleCondition == nil && rule.Expr != "" { + // account for legacy rules + rule.RuleType = RuleTypeProm + rule.EvalWindow = Duration(5 * time.Minute) + rule.Frequency = Duration(1 * time.Minute) + rule.RuleCondition = &RuleCondition{ + CompositeMetricQuery: &model.CompositeMetricQuery{ + QueryType: model.PROM, + PromQueries: map[string]*model.PromQuery{ + "A": &model.PromQuery{ + Query: rule.Expr, + }, + }, + }, + } + } + + if rule.EvalWindow == 0 { + rule.EvalWindow = Duration(5 * time.Minute) + } + + if rule.Frequency == 0 { + rule.Frequency = Duration(1 * time.Minute) + } + + if rule.RuleCondition != nil { + if rule.RuleCondition.CompositeMetricQuery.QueryType == model.QUERY_BUILDER { + rule.RuleType = RuleTypeThreshold + } else if rule.RuleCondition.CompositeMetricQuery.QueryType == model.PROM { + rule.RuleType = RuleTypeProm + } + + for qLabel, q := range rule.RuleCondition.CompositeMetricQuery.BuilderQueries { + if q.MetricName != "" && q.Expression == "" { + q.Expression = qLabel + } + } + } + + zap.S().Debugf("postable rule:", rule, "\t condition", rule.RuleCondition.String()) + + if errs := rule.Validate(); len(errs) > 0 { + return nil, errs + } + return &rule, []error{} +} + +func isValidLabelName(ln string) bool { + if len(ln) == 0 { + return false + } + for i, b := range ln { + if !((b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' || (b >= '0' && b <= '9' && i > 0)) { + return false + } + } + return true +} + +func isValidLabelValue(v 
string) bool {
+	return utf8.ValidString(v)
+}
+
+func (r *PostableRule) Validate() (errs []error) {
+
+	if r.RuleCondition == nil {
+		errs = append(errs, errors.Errorf("rule condition is required"))
+	} else {
+		if r.RuleCondition.CompositeMetricQuery == nil {
+			errs = append(errs, errors.Errorf("composite metric query is required"))
+		}
+	}
+
+	if r.RuleType == RuleTypeThreshold {
+		if r.RuleCondition.Target == nil {
+			errs = append(errs, errors.Errorf("rule condition missing the threshold"))
+		}
+		if r.RuleCondition.CompareOp == "" {
+			errs = append(errs, errors.Errorf("rule condition missing the compare op"))
+		}
+		if r.RuleCondition.MatchType == "" {
+			errs = append(errs, errors.Errorf("rule condition missing the match option"))
+		}
+	}
+
+	for k, v := range r.Labels {
+		if !isValidLabelName(k) {
+			errs = append(errs, errors.Errorf("invalid label name: %s", k))
+		}
+
+		if !isValidLabelValue(v) {
+			errs = append(errs, errors.Errorf("invalid label value: %s", v))
+		}
+	}
+
+	for k := range r.Annotations {
+		if !isValidLabelName(k) {
+			errs = append(errs, errors.Errorf("invalid annotation name: %s", k))
+		}
+	}
+
+	errs = append(errs, testTemplateParsing(r)...)
+	return errs
+}
+
+func testTemplateParsing(rl *PostableRule) (errs []error) {
+	if rl.Alert == "" {
+		// Not an alerting rule.
+		return errs
+	}
+
+	// Trying to parse templates.
+	tmplData := AlertTemplateData(make(map[string]string), 0)
+	defs := "{{$labels := .Labels}}{{$value := .Value}}"
+	parseTest := func(text string) error {
+		tmpl := NewTemplateExpander(
+			context.TODO(),
+			defs+text,
+			"__alert_"+rl.Alert,
+			tmplData,
+			times.Time(timestamp.FromTime(time.Now())),
+			nil,
+		)
+		return tmpl.ParseTest()
+	}
+
+	// Parsing Labels.
+	for _, val := range rl.Labels {
+		err := parseTest(val)
+		if err != nil {
+			errs = append(errs, fmt.Errorf("msg=%s", err.Error()))
+		}
+	}
+
+	// Parsing Annotations.
+	for _, val := range rl.Annotations {
+		err := parseTest(val)
+		if err != nil {
+			errs = append(errs, fmt.Errorf("msg=%s", err.Error()))
+		}
+	}
+
+	return errs
+}
+
+// GettableRules has info for all stored rules.
+type GettableRules struct {
+	Rules []*GettableRule `json:"rules"`
+}
+
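To make the legacy branch of `parsePostableRule` above concrete: a rule posted with only `alert` and `expr` is upgraded to a PromQL rule with the 5m/1m defaults and passes `Validate` unchanged. A minimal sketch, assuming it is compiled inside the query-service module (the `main` harness is illustrative, not part of this PR):

```go
package main

import (
	"fmt"
	"time"

	"go.signoz.io/query-service/rules"
)

func main() {
	// legacy payload: no ruleType, no condition, just an expression
	payload := []byte(`{"alert": "InstanceDown", "expr": "up == 0"}`)

	rule, errs := rules.ParsePostableRule(payload)
	if len(errs) > 0 {
		fmt.Println("parse errors:", errs)
		return
	}

	// the parser fills in RuleTypeProm, wraps the expression into the
	// PromQueries map under key "A", and applies the default windows
	fmt.Println(rule.RuleType)                  // promql_rule
	fmt.Println(time.Duration(rule.EvalWindow)) // 5m0s
	fmt.Println(time.Duration(rule.Frequency))  // 1m0s
}
```

+// GettableRule has info for an alerting rule.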
+type GettableRule struct { + Labels map[string]string `json:"labels"` + Annotations map[string]string `json:"annotations"` + State string `json:"state"` + Alert string `json:"alert"` + // Description string `yaml:"description,omitempty" json:"description,omitempty"` + + Id string `json:"id"` + RuleType RuleType `yaml:"ruleType,omitempty" json:"ruleType,omitempty"` + EvalWindow Duration `yaml:"evalWindow,omitempty" json:"evalWindow,omitempty"` + Frequency Duration `yaml:"frequency,omitempty" json:"frequency,omitempty"` + RuleCondition RuleCondition `yaml:"condition,omitempty" json:"condition,omitempty"` + + // ActiveAt *time.Time `json:"activeAt,omitempty"` + // Value float64 `json:"value"` +} diff --git a/pkg/query-service/rules/db.go b/pkg/query-service/rules/db.go new file mode 100644 index 0000000000..7070f23346 --- /dev/null +++ b/pkg/query-service/rules/db.go @@ -0,0 +1,187 @@ +package rules + +import ( + "fmt" + "github.com/jmoiron/sqlx" + "go.uber.org/zap" + "strconv" + "time" +) + +// Data store to capture user alert rule settings +type RuleDB interface { + // CreateRuleTx stores rule in the db and returns tx and group name (on success) + CreateRuleTx(rule string) (string, Tx, error) + + // EditRuleTx updates the given rule in the db and returns tx and group name (on success) + EditRuleTx(rule string, id string) (string, Tx, error) + + // DeleteRuleTx deletes the given rule in the db and returns tx and group name (on success) + DeleteRuleTx(id string) (string, Tx, error) + + // GetStoredRules fetches the rule definitions from db + GetStoredRules() ([]StoredRule, error) + + // GetStoredRule for a given ID from DB + GetStoredRule(id string) (*StoredRule, error) +} + +type StoredRule struct { + Id int `json:"id" db:"id"` + UpdatedAt time.Time `json:"updated_at" db:"updated_at"` + Data string `json:"data" db:"data"` +} + +type Tx interface { + Commit() error + Rollback() error +} + +type ruleDB struct { + *sqlx.DB +} + +// todo: move init methods for creating tables + +func newRuleDB(db *sqlx.DB) RuleDB { + return &ruleDB{ + db, + } +} + +// CreateRuleTx stores a given rule in db and returns task name, +// sql tx and error (if any) +func (r *ruleDB) CreateRuleTx(rule string) (string, Tx, error) { + + var groupName string + var lastInsertId int64 + + tx, err := r.Begin() + if err != nil { + return groupName, nil, err + } + + stmt, err := tx.Prepare(`INSERT into rules (updated_at, data) VALUES($1,$2);`) + if err != nil { + zap.S().Errorf("Error in preparing statement for INSERT to rules\n", err) + tx.Rollback() + return groupName, nil, err + } + + defer stmt.Close() + + result, err := stmt.Exec(time.Now(), rule) + if err != nil { + zap.S().Errorf("Error in Executing prepared statement for INSERT to rules\n", err) + tx.Rollback() // return an error too, we may want to wrap them + return groupName, nil, err + } + + lastInsertId, _ = result.LastInsertId() + + groupName = prepareTaskName(lastInsertId) + + return groupName, tx, nil + +} + +// EditRuleTx stores a given rule string in database and returns +// task name, sql tx and error (if any) +func (r *ruleDB) EditRuleTx(rule string, id string) (string, Tx, error) { + + var groupName string + idInt, _ := strconv.Atoi(id) + if idInt == 0 { + return groupName, nil, fmt.Errorf("failed to read alert id from parameters") + } + + groupName = prepareTaskName(int64(idInt)) + + // todo(amol): resolve this error - database locked when using + // edit transaction with sqlx + // tx, err := r.Begin() + //if err != nil { + // return groupName, tx, err + 
//} + stmt, err := r.Prepare(`UPDATE rules SET updated_at=$1, data=$2 WHERE id=$3;`) + if err != nil { + zap.S().Errorf("Error in preparing statement for UPDATE to rules\n", err) + // tx.Rollback() + return groupName, nil, err + } + defer stmt.Close() + + if _, err := stmt.Exec(time.Now(), rule, idInt); err != nil { + zap.S().Errorf("Error in Executing prepared statement for UPDATE to rules\n", err) + // tx.Rollback() // return an error too, we may want to wrap them + return groupName, nil, err + } + return groupName, nil, nil +} + +// DeleteRuleTx deletes a given rule with id and returns +// taskname, sql tx and error (if any) +func (r *ruleDB) DeleteRuleTx(id string) (string, Tx, error) { + + idInt, _ := strconv.Atoi(id) + groupName := prepareTaskName(int64(idInt)) + + // commented as this causes db locked error + // tx, err := r.Begin() + // if err != nil { + // return groupName, tx, err + // } + + stmt, err := r.Prepare(`DELETE FROM rules WHERE id=$1;`) + + if err != nil { + return groupName, nil, err + } + + defer stmt.Close() + + if _, err := stmt.Exec(idInt); err != nil { + zap.S().Errorf("Error in Executing prepared statement for DELETE to rules\n", err) + // tx.Rollback() + return groupName, nil, err + } + + return groupName, nil, nil +} + +func (r *ruleDB) GetStoredRules() ([]StoredRule, error) { + + rules := []StoredRule{} + + query := fmt.Sprintf("SELECT id, updated_at, data FROM rules") + + err := r.Select(&rules, query) + + if err != nil { + zap.S().Debug("Error in processing sql query: ", err) + return nil, err + } + + return rules, nil +} + +func (r *ruleDB) GetStoredRule(id string) (*StoredRule, error) { + intId, err := strconv.Atoi(id) + if err != nil { + return nil, fmt.Errorf("invalid id parameter") + } + + rule := &StoredRule{} + + query := fmt.Sprintf("SELECT id, updated_at, data FROM rules WHERE id=%d", intId) + err = r.Get(rule, query) + + // zap.S().Info(query) + + if err != nil { + zap.S().Error("Error in processing sql query: ", err) + return nil, err + } + + return rule, nil +} diff --git a/pkg/query-service/rules/manager.go b/pkg/query-service/rules/manager.go new file mode 100644 index 0000000000..9a040fdf74 --- /dev/null +++ b/pkg/query-service/rules/manager.go @@ -0,0 +1,595 @@ +package rules + +import ( + "context" + "encoding/json" + "fmt" + "sort" + "strconv" + "strings" + "sync" + "time" + + "github.com/go-kit/log" + + "go.uber.org/zap" + + "github.com/jmoiron/sqlx" + "github.com/pkg/errors" + + // opentracing "github.com/opentracing/opentracing-go" + am "go.signoz.io/query-service/integrations/alertManager" +) + +// namespace for prom metrics +const namespace = "signoz" +const taskNamesuffix = "webAppEditor" + +func ruleIdFromTaskName(n string) string { + return strings.Split(n, "-groupname")[0] +} + +func prepareTaskName(ruleId int64) string { + return fmt.Sprintf("%d-groupname", ruleId) +} + +// ManagerOptions bundles options for the Manager. +type ManagerOptions struct { + NotifierOpts am.NotifierOptions + Queriers *Queriers + + // RepoURL is used to generate a backlink in sent alert messages + RepoURL string + + // rule db conn + DBConn *sqlx.DB + + Context context.Context + Logger log.Logger + ResendDelay time.Duration + DisableRules bool +} + +// The Manager manages recording and alerting rules. 
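+// It owns the rule tasks, the alert notifier and the rule datastore.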
+type Manager struct {
+	opts  *ManagerOptions
+	tasks map[string]Task
+	rules map[string]Rule
+	mtx   sync.RWMutex
+	block chan struct{}
+	// Notifier sends messages through alert manager
+	notifier *am.Notifier
+
+	// datastore to store alert definitions
+	ruleDB RuleDB
+
+	// pause all rule tasks
+	pause  bool
+	logger log.Logger
+}
+
+func defaultOptions(o *ManagerOptions) *ManagerOptions {
+	if o.NotifierOpts.QueueCapacity == 0 {
+		o.NotifierOpts.QueueCapacity = 10000
+	}
+	if o.NotifierOpts.Timeout == 0 {
+		o.NotifierOpts.Timeout = 10 * time.Second
+	}
+	if o.ResendDelay == time.Duration(0) {
+		o.ResendDelay = 1 * time.Minute
+	}
+	return o
+}
+
+// NewManager returns an implementation of Manager, ready to be started
+// by calling the Run method.
+func NewManager(o *ManagerOptions) (*Manager, error) {
+
+	o = defaultOptions(o)
+	// here we just initiate notifier, it will be started
+	// in run()
+	notifier, err := am.NewNotifier(&o.NotifierOpts, nil)
+	if err != nil {
+		// todo(amol): rethink on this, the query service
+		// should not be down because alert manager is not available
+		return nil, err
+	}
+
+	db := newRuleDB(o.DBConn)
+
+	m := &Manager{
+		tasks:    map[string]Task{},
+		rules:    map[string]Rule{},
+		notifier: notifier,
+		ruleDB:   db,
+		opts:     o,
+		block:    make(chan struct{}),
+		logger:   o.Logger,
+	}
+	return m, nil
+}
+
+func (m *Manager) Start() {
+	if err := m.initiate(); err != nil {
+		zap.S().Errorf("failed to initialize alerting rules manager: %v", err)
+	}
+	m.run()
+}
+
+func (m *Manager) Pause(b bool) {
+	m.mtx.Lock()
+	defer m.mtx.Unlock()
+	for _, t := range m.tasks {
+		t.Pause(b)
+	}
+}
+
+func (m *Manager) initiate() error {
+	storedRules, err := m.ruleDB.GetStoredRules()
+	if err != nil {
+		return err
+	}
+	if len(storedRules) == 0 {
+		return nil
+	}
+	var loadErrors []error
+
+	for _, rec := range storedRules {
+		taskName := fmt.Sprintf("%d-groupname", rec.Id)
+		parsedRule, errs := ParsePostableRule([]byte(rec.Data))
+
+		if len(errs) > 0 {
+			if errs[0].Error() == "failed to load json" {
+				zap.S().Info("failed to load rule in json format, trying yaml now:", rec.Data)
+
+				// see if rule is stored in yaml format
+				parsedRule, errs = parsePostableRule([]byte(rec.Data), "yaml")
+
+				if parsedRule == nil {
+					zap.S().Errorf("failed to parse and initialize yaml rule:", errs)
+					// just one rule is being parsed so expect just one error
+					loadErrors = append(loadErrors, errs[0])
+					continue
+				} else {
+					// rule stored in yaml, so migrate it to json
+					zap.S().Info("msg:", "migrating rule from yaml to JSON", "\t rule:", rec.Data, "\t parsed rule:", parsedRule)
+					ruleJSON, err := json.Marshal(parsedRule)
+					if err == nil {
+						taskName, _, err := m.ruleDB.EditRuleTx(string(ruleJSON), fmt.Sprintf("%d", rec.Id))
+						if err != nil {
+							zap.S().Errorf("msg: failed to migrate rule ", "\t error:", err)
+						} else {
+							zap.S().Info("msg:", "migrated rule from yaml to json", "\t rule:", taskName)
+						}
+					}
+				}
+			} else {
+				zap.S().Errorf("failed to parse and initialize rule:", errs)
+				// just one rule is being parsed so expect just one error
+				loadErrors = append(loadErrors, errs[0])
+				continue
+			}
+		}
+
+		err := m.addTask(parsedRule, taskName)
+		if err != nil {
+			zap.S().Errorf("failed to load the rule definition (%s): %v", taskName, err)
+		}
+	}
+
+	return nil
+}
+
+// Run starts processing of the rule manager.
+func (m *Manager) run() {
+	// initiate notifier
+	go m.notifier.Run()
+
+	// initiate blocked tasks
+	close(m.block)
+}
+
+// Stop the rule manager's rule evaluation cycles.
+func (m *Manager) Stop() {
+	m.mtx.Lock()
+	defer m.mtx.Unlock()
+
+	zap.S().Info("msg: ", "Stopping rule manager...")
+
+	for _, t := range m.tasks {
+		t.Stop()
+	}
+
+	zap.S().Info("msg: ", "Rule manager stopped")
+}
+
+// EditRule writes the new rule definition to the
+// datastore and also updates the rule executor
+func (m *Manager) EditRule(ruleStr string, id string) error {
+	// todo(amol): fetch recent rule from db first
+	parsedRule, errs := ParsePostableRule([]byte(ruleStr))
+
+	if len(errs) > 0 {
+		zap.S().Errorf("failed to parse rules:", errs)
+		// just one rule is being parsed so expect just one error
+		return errs[0]
+	}
+
+	taskName, _, err := m.ruleDB.EditRuleTx(ruleStr, id)
+	if err != nil {
+		return err
+	}
+
+	if !m.opts.DisableRules {
+		err = m.editTask(parsedRule, taskName)
+		if err != nil {
+			// todo(amol): wrapping this in a tx causes a "database is
+			// locked" error with sqlite3; needs further investigation
+			//tx.Rollback()
+			return err
+		}
+	}
+
+	// return tx.Commit()
+	return nil
+}
+
+func (m *Manager) editTask(rule *PostableRule, taskName string) error {
+	m.mtx.Lock()
+	defer m.mtx.Unlock()
+
+	newTask, err := m.prepareTask(false, rule, taskName)
+
+	if err != nil {
+		zap.S().Errorf("msg:", "loading tasks failed", "\t err:", err)
+		return errors.New("error preparing rule with given parameters, previous rule set restored")
+	}
+
+	// If there is an old task with the same identifier, stop it and wait for
+	// it to finish the current iteration. Then copy its state into the new task.
+	oldTask, ok := m.tasks[taskName]
+	if !ok {
+		zap.S().Errorf("msg:", "rule task not found, edit task failed", "\t task name:", taskName)
+		return errors.New("rule task not found, edit task failed")
+	}
+
+	delete(m.tasks, taskName)
+	oldTask.Stop()
+	newTask.CopyState(oldTask)
+
+	go func() {
+		// Wait with starting evaluation until the rule manager
+		// is told to run. This is necessary to avoid running
+		// queries against a bootstrapping storage.
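+		// m.block is closed exactly once, in run(); a receive from a closed
+		// channel returns immediately, so tasks created after startup do not
+		// wait here, while tasks created during initiate() block until run().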
+ <-m.block + newTask.Run(m.opts.Context) + }() + + m.tasks[taskName] = newTask + return nil +} + +func (m *Manager) DeleteRule(id string) error { + + idInt, err := strconv.Atoi(id) + if err != nil { + zap.S().Errorf("msg: ", "delete rule received an rule id in invalid format, must be a number", "\t ruleid:", id) + return fmt.Errorf("delete rule received an rule id in invalid format, must be a number") + } + + taskName := prepareTaskName(int64(idInt)) + if !m.opts.DisableRules { + if err := m.deleteTask(taskName); err != nil { + zap.S().Errorf("msg: ", "failed to unload the rule task from memory, please retry", "\t ruleid: ", id) + return err + } + } + + if _, _, err := m.ruleDB.DeleteRuleTx(id); err != nil { + zap.S().Errorf("msg: ", "failed to delete the rule from rule db", "\t ruleid: ", id) + return err + } + + return nil +} + +func (m *Manager) deleteTask(taskName string) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + oldg, ok := m.tasks[taskName] + if ok { + oldg.Stop() + delete(m.tasks, taskName) + delete(m.rules, ruleIdFromTaskName(taskName)) + } else { + zap.S().Errorf("msg:", "rule not found for deletion", "\t name:", taskName) + return fmt.Errorf("rule not found") + } + + return nil +} + +// CreateRule stores rule def into db and also +// starts an executor for the rule +func (m *Manager) CreateRule(ruleStr string) error { + parsedRule, errs := ParsePostableRule([]byte(ruleStr)) + + if len(errs) > 0 { + zap.S().Errorf("failed to parse rules:", errs) + // just one rule is being parsed so expect just one error + return errs[0] + } + + taskName, tx, err := m.ruleDB.CreateRuleTx(ruleStr) + if err != nil { + return err + } + if !m.opts.DisableRules { + if err := m.addTask(parsedRule, taskName); err != nil { + tx.Rollback() + return err + } + } + return tx.Commit() +} + +func (m *Manager) addTask(rule *PostableRule, taskName string) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + newTask, err := m.prepareTask(false, rule, taskName) + + if err != nil { + zap.S().Errorf("msg:", "creating rule task failed", "\t name:", taskName, "\t err", err) + return errors.New("error loading rules, previous rule set restored") + } + + // If there is an another task with the same identifier, raise an error + _, ok := m.tasks[taskName] + if ok { + return fmt.Errorf("a rule with the same name already exists") + } + + go func() { + // Wait with starting evaluation until the rule manager + // is told to run. This is necessary to avoid running + // queries against a bootstrapping storage. 
+ <-m.block + newTask.Run(m.opts.Context) + }() + + m.tasks[taskName] = newTask + return nil +} + +// prepareTask prepares a rule task from postable rule +func (m *Manager) prepareTask(acquireLock bool, r *PostableRule, taskName string) (Task, error) { + + if acquireLock { + m.mtx.Lock() + defer m.mtx.Unlock() + } + + rules := make([]Rule, 0) + var task Task + + if r.Alert == "" { + zap.S().Errorf("msg:", "task load failed, at least one rule must be set", "\t task name:", taskName) + return task, fmt.Errorf("task load failed, at least one rule must be set") + } + + ruleId := ruleIdFromTaskName(taskName) + if r.RuleType == RuleTypeThreshold { + // create a threshold rule + tr, err := NewThresholdRule( + ruleId, + r.Alert, + r.RuleCondition, + time.Duration(r.EvalWindow), + r.Labels, + r.Annotations, + r.Source, + ) + + if err != nil { + return task, err + } + + rules = append(rules, tr) + + // create ch rule task for evalution + task = newTask(TaskTypeCh, taskName, taskNamesuffix, time.Duration(r.Frequency), rules, m.opts, m.prepareNotifyFunc()) + + // add rule to memory + m.rules[ruleId] = tr + + } else if r.RuleType == RuleTypeProm { + + // create promql rule + pr, err := NewPromRule( + ruleId, + r.Alert, + r.RuleCondition, + time.Duration(r.EvalWindow), + r.Labels, + r.Annotations, + // required as promql engine works with logger and not zap + log.With(m.logger, "alert", r.Alert), + r.Source, + ) + + if err != nil { + return task, err + } + + rules = append(rules, pr) + + // create promql rule task for evalution + task = newTask(TaskTypeProm, taskName, taskNamesuffix, time.Duration(r.Frequency), rules, m.opts, m.prepareNotifyFunc()) + + // add rule to memory + m.rules[ruleId] = pr + + } else { + return nil, fmt.Errorf(fmt.Sprintf("unsupported rule type. Supported types: %s, %s", RuleTypeProm, RuleTypeThreshold)) + } + + return task, nil +} + +// RuleTasks returns the list of manager's rule tasks. +func (m *Manager) RuleTasks() []Task { + m.mtx.RLock() + defer m.mtx.RUnlock() + + rgs := make([]Task, 0, len(m.tasks)) + for _, g := range m.tasks { + rgs = append(rgs, g) + } + + sort.Slice(rgs, func(i, j int) bool { + return rgs[i].Name() < rgs[j].Name() + }) + + return rgs +} + +// RuleTasks returns the list of manager's rule tasks. +func (m *Manager) RuleTasksWithoutLock() []Task { + + rgs := make([]Task, 0, len(m.tasks)) + for _, g := range m.tasks { + rgs = append(rgs, g) + } + + sort.Slice(rgs, func(i, j int) bool { + return rgs[i].Name() < rgs[j].Name() + }) + + return rgs +} + +// Rules returns the list of the manager's rules. +func (m *Manager) Rules() []Rule { + m.mtx.RLock() + defer m.mtx.RUnlock() + + rules := []Rule{} + for _, r := range m.rules { + rules = append(rules, r) + } + + return rules +} + +// TriggeredAlerts returns the list of the manager's rules. +func (m *Manager) TriggeredAlerts() []*NamedAlert { + // m.mtx.RLock() + // defer m.mtx.RUnlock() + + namedAlerts := []*NamedAlert{} + + for _, r := range m.rules { + active := r.ActiveAlerts() + + for _, a := range active { + awn := &NamedAlert{ + Alert: a, + Name: r.Name(), + } + namedAlerts = append(namedAlerts, awn) + } + } + + return namedAlerts +} + +// NotifyFunc sends notifications about a set of alerts generated by the given expression. +type NotifyFunc func(ctx context.Context, expr string, alerts ...*Alert) + +// prepareNotifyFunc implements the NotifyFunc for a Notifier. 
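+// It converts the internal Alert model into alertmanager payloads: StartsAt
+// comes from FiredAt, and EndsAt is either ResolvedAt (for resolved alerts)
+// or ValidUntil, so that a firing alert expires on its own if the rule
+// manager stops re-sending it.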
+func (m *Manager) prepareNotifyFunc() NotifyFunc { + return func(ctx context.Context, expr string, alerts ...*Alert) { + var res []*am.Alert + + for _, alert := range alerts { + generatorURL := alert.GeneratorURL + if generatorURL == "" { + generatorURL = m.opts.RepoURL + } + + a := &am.Alert{ + StartsAt: alert.FiredAt, + Labels: alert.Labels, + Annotations: alert.Annotations, + GeneratorURL: generatorURL, + } + if !alert.ResolvedAt.IsZero() { + a.EndsAt = alert.ResolvedAt + } else { + a.EndsAt = alert.ValidUntil + } + res = append(res, a) + } + + if len(alerts) > 0 { + m.notifier.Send(res...) + } + } +} + +func (m *Manager) ListActiveRules() ([]Rule, error) { + ruleList := []Rule{} + + for _, r := range m.rules { + ruleList = append(ruleList, r) + } + + return ruleList, nil +} + +func (m *Manager) ListRuleStates() (*GettableRules, error) { + + // fetch rules from DB + storedRules, err := m.ruleDB.GetStoredRules() + + // initiate response object + resp := make([]*GettableRule, 0) + + for _, s := range storedRules { + + ruleResponse := &GettableRule{} + if err := json.Unmarshal([]byte(s.Data), ruleResponse); err != nil { // Parse []byte to go struct pointer + zap.S().Errorf("msg:", "invalid rule data", "\t err:", err) + continue + } + + ruleResponse.Id = fmt.Sprintf("%d", s.Id) + + // fetch state of rule from memory + if rm, ok := m.rules[ruleResponse.Id]; !ok { + zap.S().Warnf("msg:", "invalid rule id found while fetching list of rules", "\t err:", err, "\t rule_id:", ruleResponse.Id) + } else { + ruleResponse.State = rm.State().String() + } + resp = append(resp, ruleResponse) + } + + return &GettableRules{Rules: resp}, nil +} + +func (m *Manager) GetRule(id string) (*GettableRule, error) { + s, err := m.ruleDB.GetStoredRule(id) + if err != nil { + return nil, err + } + r := &GettableRule{} + if err := json.Unmarshal([]byte(s.Data), r); err != nil { + return nil, err + } + r.Id = fmt.Sprintf("%d", s.Id) + return r, nil +} diff --git a/pkg/query-service/rules/manager_test.go b/pkg/query-service/rules/manager_test.go new file mode 100644 index 0000000000..e7b059dda9 --- /dev/null +++ b/pkg/query-service/rules/manager_test.go @@ -0,0 +1,155 @@ +package rules + +import ( + "context" + "fmt" + "os" + "os/signal" + "syscall" + + "github.com/jmoiron/sqlx" + _ "github.com/mattn/go-sqlite3" + "go.signoz.io/query-service/app/clickhouseReader" + am "go.signoz.io/query-service/integrations/alertManager" + "go.signoz.io/query-service/model" + pqle "go.signoz.io/query-service/pqlEngine" + "go.signoz.io/query-service/utils/value" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "net/url" + "testing" + "time" +) + +func initZapLog() *zap.Logger { + config := zap.NewDevelopmentConfig() + config.EncoderConfig.EncodeLevel = zapcore.CapitalColorLevelEncoder + config.EncoderConfig.TimeKey = "timestamp" + config.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder + logger, _ := config.Build() + return logger +} + +func TestRules(t *testing.T) { + fmt.Println("starting test TestRules..") + loggerMgr := initZapLog() + zap.ReplaceGlobals(loggerMgr) + defer loggerMgr.Sync() // flushes buffer, if any + + logger := loggerMgr.Sugar() + + configFile := "../config/prometheus.yml" + // create engine + pqle, err := pqle.FromConfigPath(configFile) + if err != nil { + fmt.Println("failed to create pql:", err) + t.Errorf("failed to create pql engine : %v", err) + } + + // create db conn + db, err := sqlx.Open("sqlite3", "../signoz.db") + if err != nil { + fmt.Println("failed to create db conn:", err) + t.Errorf("failed 
to create db conn: %v", err) + } + + // create ch reader + ch := clickhouseReader.NewReader(db, configFile) + + // notifier opts + notifierOpts := am.NotifierOptions{ + QueueCapacity: 10000, + Timeout: 1 * time.Second, + AlertManagerURLs: []string{"http://localhost:9093/api/"}, + } + + externalURL, _ := url.Parse("http://signoz.io") + + // create manager opts + managerOpts := &ManagerOptions{ + NotifierOpts: notifierOpts, + Queriers: &Queriers{ + PqlEngine: pqle, + Ch: ch, + }, + ExternalURL: externalURL, + Conn: db, + Context: context.Background(), + Logger: nil, + } + + // create Manager + manager, err := NewManager(managerOpts) + if err != nil { + fmt.Println("manager error:", err) + t.Errorf("manager error: %v", err) + } + fmt.Println("manager is ready:", manager) + + manager.run() + + // test rules + // create promql rule + /* promql rule + postableRule := PostableRule{ + Alert: "test alert 1 - promql", + RuleType: RuleTypeProm, + EvalWindow: 5 * time.Minute, + Frequency: 30 * time.Second, + RuleCondition: RuleCondition{ + CompositeMetricQuery: &model.CompositeMetricQuery{ + QueryType: model.PROM, + PromQueries: map[string]*model.PromQuery{ + "A": &model.PromQuery{Query: `sum(signoz_latency_count{span_kind="SPAN_KIND_SERVER"}) by (service_name) > 100`}, + }, + }, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + }*/ + // create builder rule + metricQuery := &model.MetricQuery{ + QueryName: "A", + MetricName: "signoz_latency_count", + TagFilters: &model.FilterSet{Operation: "AND", Items: []model.FilterItem{ + {Key: "span_kind", Value: "SPAN_KIND_SERVER", Operation: "neq"}, + }}, + GroupingTags: []string{"service_name"}, + AggregateOperator: model.RATE_SUM, + Expression: "A", + } + + postableRule := PostableRule{ + Alert: "test alert 2 - builder", + RuleType: RuleTypeThreshold, + EvalWindow: 5 * time.Minute, + Frequency: 30 * time.Second, + RuleCondition: RuleCondition{ + Target: value.Float64(500), + CompareOp: TargetIsMore, + CompositeMetricQuery: &model.CompositeMetricQuery{ + QueryType: model.QUERY_BUILDER, + BuilderQueries: map[string]*model.MetricQuery{ + "A": metricQuery, + }, + }, + }, + Labels: map[string]string{"host": "server1"}, + Annotations: map[string]string{}, + } + err = manager.addTask(&postableRule, postableRule.Alert) + if err != nil { + fmt.Println("failed to add rule: ", err) + t.Errorf("failed to add rule") + } + + signalsChannel := make(chan os.Signal, 1) + signal.Notify(signalsChannel, os.Interrupt, syscall.SIGTERM) + + for { + select { + case <-signalsChannel: + logger.Fatal("Received OS Interrupt Signal ... 
") + } + } +} diff --git a/pkg/query-service/rules/promRule.go b/pkg/query-service/rules/promRule.go new file mode 100644 index 0000000000..669d6e3845 --- /dev/null +++ b/pkg/query-service/rules/promRule.go @@ -0,0 +1,445 @@ +package rules + +import ( + "context" + "fmt" + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "go.uber.org/zap" + "sync" + "time" + + plabels "github.com/prometheus/prometheus/pkg/labels" + pql "github.com/prometheus/prometheus/promql" + "go.signoz.io/query-service/model" + qslabels "go.signoz.io/query-service/utils/labels" + "go.signoz.io/query-service/utils/times" + "go.signoz.io/query-service/utils/timestamp" + yaml "gopkg.in/yaml.v2" +) + +type PromRule struct { + id string + name string + source string + ruleCondition *RuleCondition + + evalWindow time.Duration + holdDuration time.Duration + labels plabels.Labels + annotations plabels.Labels + + mtx sync.Mutex + evaluationDuration time.Duration + evaluationTimestamp time.Time + + health RuleHealth + + lastError error + + // map of active alerts + active map[uint64]*Alert + + logger log.Logger +} + +func NewPromRule( + id string, + name string, + ruleCondition *RuleCondition, + evalWindow time.Duration, + labels, annotations map[string]string, + logger log.Logger, + source string, +) (*PromRule, error) { + + if int64(evalWindow) == 0 { + evalWindow = 5 * time.Minute + } + + if ruleCondition == nil { + return nil, fmt.Errorf("no rule condition") + } else if !ruleCondition.IsValid() { + return nil, fmt.Errorf("invalid rule condition") + } + + zap.S().Info("msg:", "creating new alerting rule", "\t name:", name, "\t condition:", ruleCondition.String()) + + return &PromRule{ + id: id, + name: name, + source: source, + ruleCondition: ruleCondition, + evalWindow: evalWindow, + labels: plabels.FromMap(labels), + annotations: plabels.FromMap(annotations), + health: HealthUnknown, + active: map[uint64]*Alert{}, + logger: logger, + }, nil +} + +func (r *PromRule) Name() string { + return r.name +} + +func (r *PromRule) ID() string { + return r.id +} + +func (r *PromRule) Condition() *RuleCondition { + return r.ruleCondition +} + +func (r *PromRule) Type() RuleType { + return RuleTypeProm +} + +func (r *PromRule) GeneratorURL() string { + return r.source +} + +func (r *PromRule) SetLastError(err error) { + r.mtx.Lock() + defer r.mtx.Unlock() + r.lastError = err +} + +func (r *PromRule) LastError() error { + r.mtx.Lock() + defer r.mtx.Unlock() + return r.lastError +} + +func (r *PromRule) SetHealth(health RuleHealth) { + r.mtx.Lock() + defer r.mtx.Unlock() + r.health = health +} + +func (r *PromRule) Health() RuleHealth { + r.mtx.Lock() + defer r.mtx.Unlock() + return r.health +} + +// SetEvaluationDuration updates evaluationDuration to the duration it took to evaluate the rule on its last evaluation. +func (r *PromRule) SetEvaluationDuration(dur time.Duration) { + r.mtx.Lock() + defer r.mtx.Unlock() + r.evaluationDuration = dur +} + +func (r *PromRule) HoldDuration() time.Duration { + return r.holdDuration +} + +func (r *PromRule) EvalWindow() time.Duration { + return r.evalWindow +} + +// Labels returns the labels of the alerting rule. +func (r *PromRule) Labels() qslabels.BaseLabels { + return r.labels +} + +// Annotations returns the annotations of the alerting rule. 
+func (r *PromRule) Annotations() qslabels.BaseLabels { + return r.annotations +} + +func (r *PromRule) sample(alert *Alert, ts time.Time) pql.Sample { + lb := plabels.NewBuilder(r.labels) + + alertLabels := alert.Labels.(plabels.Labels) + for _, l := range alertLabels { + lb.Set(l.Name, l.Value) + } + + lb.Set(qslabels.MetricNameLabel, alertMetricName) + lb.Set(qslabels.AlertNameLabel, r.name) + lb.Set(qslabels.AlertStateLabel, alert.State.String()) + + s := pql.Sample{ + Metric: lb.Labels(), + Point: pql.Point{T: timestamp.FromTime(ts), V: 1}, + } + return s +} + +// forStateSample returns the sample for ALERTS_FOR_STATE. +func (r *PromRule) forStateSample(alert *Alert, ts time.Time, v float64) pql.Sample { + lb := plabels.NewBuilder(r.labels) + alertLabels := alert.Labels.(plabels.Labels) + for _, l := range alertLabels { + lb.Set(l.Name, l.Value) + } + + lb.Set(plabels.MetricName, alertForStateMetricName) + lb.Set(plabels.AlertName, r.name) + + s := pql.Sample{ + Metric: lb.Labels(), + Point: pql.Point{T: timestamp.FromTime(ts), V: v}, + } + return s +} + +// GetEvaluationDuration returns the time in seconds it took to evaluate the alerting rule. +func (r *PromRule) GetEvaluationDuration() time.Duration { + r.mtx.Lock() + defer r.mtx.Unlock() + return r.evaluationDuration +} + +// SetEvaluationTimestamp updates evaluationTimestamp to the timestamp of when the rule was last evaluated. +func (r *PromRule) SetEvaluationTimestamp(ts time.Time) { + r.mtx.Lock() + defer r.mtx.Unlock() + r.evaluationTimestamp = ts +} + +// GetEvaluationTimestamp returns the time the evaluation took place. +func (r *PromRule) GetEvaluationTimestamp() time.Time { + r.mtx.Lock() + defer r.mtx.Unlock() + return r.evaluationTimestamp +} + +// State returns the maximum state of alert instances for this rule. +// StateFiring > StatePending > StateInactive +func (r *PromRule) State() AlertState { + r.mtx.Lock() + defer r.mtx.Unlock() + + maxState := StateInactive + for _, a := range r.active { + if a.State > maxState { + maxState = a.State + } + } + return maxState +} + +func (r *PromRule) currentAlerts() []*Alert { + r.mtx.Lock() + defer r.mtx.Unlock() + + alerts := make([]*Alert, 0, len(r.active)) + + for _, a := range r.active { + anew := *a + alerts = append(alerts, &anew) + } + return alerts +} + +func (r *PromRule) ActiveAlerts() []*Alert { + var res []*Alert + for _, a := range r.currentAlerts() { + if a.ResolvedAt.IsZero() { + res = append(res, a) + } + } + return res +} + +// ForEachActiveAlert runs the given function on each alert. +// This should be used when you want to use the actual alerts from the ThresholdRule +// and not on its copy. +// If you want to run on a copy of alerts then don't use this, get the alerts from 'ActiveAlerts()'. +func (r *PromRule) ForEachActiveAlert(f func(*Alert)) { + r.mtx.Lock() + defer r.mtx.Unlock() + + for _, a := range r.active { + f(a) + } +} + +func (r *PromRule) SendAlerts(ctx context.Context, ts time.Time, resendDelay time.Duration, interval time.Duration, notifyFunc NotifyFunc) { + alerts := []*Alert{} + r.ForEachActiveAlert(func(alert *Alert) { + if alert.needsSending(ts, resendDelay) { + alert.LastSentAt = ts + // Allow for two Eval or Alertmanager send failures. + delta := resendDelay + if interval > resendDelay { + delta = interval + } + alert.ValidUntil = ts.Add(4 * delta) + anew := *alert + alerts = append(alerts, &anew) + } + }) + notifyFunc(ctx, "", alerts...) 
+} + +func (r *PromRule) getPqlQuery() (string, error) { + + if r.ruleCondition.CompositeMetricQuery.QueryType == model.PROM { + if len(r.ruleCondition.CompositeMetricQuery.PromQueries) > 0 { + if promQuery, ok := r.ruleCondition.CompositeMetricQuery.PromQueries["A"]; ok { + query := promQuery.Query + if query == "" { + return query, fmt.Errorf("a promquery needs to be set for this rule to function") + } + + if r.ruleCondition.Target != nil && r.ruleCondition.CompareOp != CompareOpNone { + query = fmt.Sprintf("%s %s %f", query, ResolveCompareOp(r.ruleCondition.CompareOp), *r.ruleCondition.Target) + return query, nil + } else { + return query, nil + } + } + } + } + + return "", fmt.Errorf("invalid promql rule query") +} + +func (r *PromRule) Eval(ctx context.Context, ts time.Time, queriers *Queriers) (interface{}, error) { + + q, err := r.getPqlQuery() + if err != nil { + return nil, err + } + zap.S().Info("rule:", r.Name(), "\t evaluating promql query: ", q) + res, err := queriers.PqlEngine.RunAlertQuery(ctx, q, ts) + if err != nil { + r.SetHealth(HealthBad) + r.SetLastError(err) + return nil, err + } + + r.mtx.Lock() + defer r.mtx.Unlock() + + resultFPs := map[uint64]struct{}{} + var vec pql.Vector + var alerts = make(map[uint64]*Alert, len(res)) + + for _, smpl := range res { + l := make(map[string]string, len(smpl.Metric)) + for _, lbl := range smpl.Metric { + l[lbl.Name] = lbl.Value + } + + tmplData := AlertTemplateData(l, smpl.V) + // Inject some convenience variables that are easier to remember for users + // who are not used to Go's templating system. + defs := "{{$labels := .Labels}}{{$value := .Value}}" + + expand := func(text string) string { + + tmpl := NewTemplateExpander( + ctx, + defs+text, + "__alert_"+r.Name(), + tmplData, + times.Time(timestamp.FromTime(ts)), + nil, + ) + result, err := tmpl.Expand() + if err != nil { + result = fmt.Sprintf("", err) + level.Warn(r.logger).Log("msg", "Expanding alert template failed", "err", err, "data", tmplData) + } + return result + } + + lb := plabels.NewBuilder(smpl.Metric).Del(plabels.MetricName) + + for _, l := range r.labels { + lb.Set(l.Name, expand(l.Value)) + } + lb.Set(qslabels.AlertNameLabel, r.Name()) + lb.Set(qslabels.AlertRuleIdLabel, r.ID()) + lb.Set(qslabels.RuleSourceLabel, r.GeneratorURL()) + + annotations := make(plabels.Labels, 0, len(r.annotations)) + for _, a := range r.annotations { + annotations = append(annotations, plabels.Label{Name: a.Name, Value: expand(a.Value)}) + } + + lbs := lb.Labels() + h := lbs.Hash() + resultFPs[h] = struct{}{} + + if _, ok := alerts[h]; ok { + err = fmt.Errorf("vector contains metrics with the same labelset after applying alert labels") + // We have already acquired the lock above hence using SetHealth and + // SetLastError will deadlock. + r.health = HealthBad + r.lastError = err + return nil, err + } + + alerts[h] = &Alert{ + Labels: lbs, + Annotations: annotations, + ActiveAt: ts, + State: StatePending, + Value: smpl.V, + GeneratorURL: r.GeneratorURL(), + } + } + + // alerts[h] is ready, add or update active list now + for h, a := range alerts { + // Check whether we already have alerting state for the identifying label set. + // Update the last value and annotations if so, create a new alert entry otherwise. + if alert, ok := r.active[h]; ok && alert.State != StateInactive { + alert.Value = a.Value + alert.Annotations = a.Annotations + continue + } + + r.active[h] = a + + } + + // Check if any pending alerts should be removed or fire now. Write out alert timeseries. 
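+	// For illustration, the transitions applied below per active alert:
+	//   - Pending and absent from this eval's results: deleted immediately
+	//   - Firing and absent: marked Inactive with ResolvedAt=ts, then kept
+	//     for resolvedRetention so the resolution reaches alertmanager
+	//   - Pending and present for at least holdDuration: promoted to Firing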
+ for fp, a := range r.active { + if _, ok := resultFPs[fp]; !ok { + // If the alert was previously firing, keep it around for a given + // retention time so it is reported as resolved to the AlertManager. + if a.State == StatePending || (!a.ResolvedAt.IsZero() && ts.Sub(a.ResolvedAt) > resolvedRetention) { + delete(r.active, fp) + } + if a.State != StateInactive { + a.State = StateInactive + a.ResolvedAt = ts + } + continue + } + + if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration { + a.State = StateFiring + a.FiredAt = ts + } + + } + r.health = HealthGood + r.lastError = err + return vec, nil + +} + +func (r *PromRule) String() string { + + ar := PostableRule{ + Alert: r.name, + RuleCondition: r.ruleCondition, + EvalWindow: Duration(r.evalWindow), + Labels: r.labels.Map(), + Annotations: r.annotations.Map(), + } + + byt, err := yaml.Marshal(ar) + if err != nil { + return fmt.Sprintf("error marshaling alerting rule: %s", err.Error()) + } + + return string(byt) +} diff --git a/pkg/query-service/rules/promRuleTask.go b/pkg/query-service/rules/promRuleTask.go new file mode 100644 index 0000000000..c06d3e1135 --- /dev/null +++ b/pkg/query-service/rules/promRuleTask.go @@ -0,0 +1,370 @@ +package rules + +import ( + "context" + "fmt" + "github.com/go-kit/log" + opentracing "github.com/opentracing/opentracing-go" + plabels "github.com/prometheus/prometheus/pkg/labels" + pql "github.com/prometheus/prometheus/promql" + "go.uber.org/zap" + "sort" + "sync" + "time" +) + +// PromRuleTask is a promql rule executor +type PromRuleTask struct { + name string + file string + frequency time.Duration + rules []Rule + seriesInPreviousEval []map[string]plabels.Labels // One per Rule. + staleSeries []plabels.Labels + opts *ManagerOptions + mtx sync.Mutex + evaluationDuration time.Duration + evaluationTime time.Duration + lastEvaluation time.Time + + markStale bool + done chan struct{} + terminated chan struct{} + managerDone chan struct{} + + pause bool + logger log.Logger + notify NotifyFunc +} + +// newPromRuleTask holds rules that have promql condition +// and evalutes the rule at a given frequency +func newPromRuleTask(name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc) *PromRuleTask { + zap.S().Info("Initiating a new rule group:", name, "\t frequency:", frequency) + + if time.Now() == time.Now().Add(frequency) { + frequency = DefaultFrequency + } + + return &PromRuleTask{ + name: name, + file: file, + pause: false, + frequency: frequency, + rules: rules, + opts: opts, + seriesInPreviousEval: make([]map[string]plabels.Labels, len(rules)), + done: make(chan struct{}), + terminated: make(chan struct{}), + notify: notify, + logger: log.With(opts.Logger, "group", name), + } +} + +// Name returns the group name. +func (g *PromRuleTask) Name() string { return g.name } + +// Key returns the group key +func (g *PromRuleTask) Key() string { + return g.name + ";" + g.file +} + +func (g *PromRuleTask) Type() TaskType { return TaskTypeProm } + +// Rules returns the group's rules. +func (g *PromRuleTask) Rules() []Rule { return g.rules } + +// Interval returns the group's interval. +func (g *PromRuleTask) Interval() time.Duration { return g.frequency } + +func (g *PromRuleTask) Pause(b bool) { + g.mtx.Lock() + defer g.mtx.Unlock() + g.pause = b +} + +func (g *PromRuleTask) Run(ctx context.Context) { + defer close(g.terminated) + + // Wait an initial amount to have consistently slotted intervals. 
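+	// EvalTimestamp picks a per-group slot (offset = hash(name) % frequency)
+	// so groups sharing a frequency evaluate staggered instead of all at once.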
+ evalTimestamp := g.EvalTimestamp(time.Now().UnixNano()).Add(g.frequency) + select { + case <-time.After(time.Until(evalTimestamp)): + case <-g.done: + return + } + + ctx = NewQueryOriginContext(ctx, map[string]interface{}{ + "ruleGroup": map[string]string{ + "name": g.Name(), + }, + }) + + iter := func() { + + start := time.Now() + g.Eval(ctx, evalTimestamp) + timeSinceStart := time.Since(start) + + g.setEvaluationTime(timeSinceStart) + g.setLastEvaluation(start) + } + + // The assumption here is that since the ticker was started after having + // waited for `evalTimestamp` to pass, the ticks will trigger soon + // after each `evalTimestamp + N * g.frequency` occurrence. + tick := time.NewTicker(g.frequency) + defer tick.Stop() + + // defer cleanup + defer func() { + if !g.markStale { + return + } + go func(now time.Time) { + for _, rule := range g.seriesInPreviousEval { + for _, r := range rule { + g.staleSeries = append(g.staleSeries, r) + } + } + // That can be garbage collected at this point. + g.seriesInPreviousEval = nil + + }(time.Now()) + + }() + + iter() + + // let the group iterate and run + for { + select { + case <-g.done: + return + default: + select { + case <-g.done: + return + case <-tick.C: + missed := (time.Since(evalTimestamp) / g.frequency) - 1 + evalTimestamp = evalTimestamp.Add((missed + 1) * g.frequency) + iter() + } + } + } +} + +func (g *PromRuleTask) Stop() { + close(g.done) + <-g.terminated +} + +func (g *PromRuleTask) hash() uint64 { + l := plabels.New( + plabels.Label{Name: "name", Value: g.name}, + ) + return l.Hash() +} + +// PromRules returns the list of the group's promql rules. +func (g *PromRuleTask) PromRules() []*PromRule { + g.mtx.Lock() + defer g.mtx.Unlock() + var alerts []*PromRule + for _, rule := range g.rules { + if tr, ok := rule.(*PromRule); ok { + alerts = append(alerts, tr) + } + } + sort.Slice(alerts, func(i, j int) bool { + return alerts[i].State() > alerts[j].State() || + (alerts[i].State() == alerts[j].State() && + alerts[i].Name() < alerts[j].Name()) + }) + return alerts +} + +// HasAlertingRules returns true if the group contains at least one AlertingRule. +func (g *PromRuleTask) HasAlertingRules() bool { + g.mtx.Lock() + defer g.mtx.Unlock() + + for _, rule := range g.rules { + if _, ok := rule.(*ThresholdRule); ok { + return true + } + } + return false +} + +// GetEvaluationDuration returns the time in seconds it took to evaluate the rule group. +func (g *PromRuleTask) GetEvaluationDuration() time.Duration { + g.mtx.Lock() + defer g.mtx.Unlock() + return g.evaluationDuration +} + +// SetEvaluationDuration sets the time in seconds the last evaluation took. +func (g *PromRuleTask) SetEvaluationDuration(dur time.Duration) { + g.mtx.Lock() + defer g.mtx.Unlock() + g.evaluationDuration = dur +} + +// GetEvaluationTime returns the time in seconds it took to evaluate the rule group. +func (g *PromRuleTask) GetEvaluationTime() time.Duration { + g.mtx.Lock() + defer g.mtx.Unlock() + return g.evaluationTime +} + +// setEvaluationTime sets the time in seconds the last evaluation took. +func (g *PromRuleTask) setEvaluationTime(dur time.Duration) { + g.mtx.Lock() + defer g.mtx.Unlock() + g.evaluationTime = dur +} + +// GetLastEvaluation returns the time the last evaluation of the rule group took place. +func (g *PromRuleTask) GetLastEvaluation() time.Time { + g.mtx.Lock() + defer g.mtx.Unlock() + return g.lastEvaluation +} + +// setLastEvaluation updates evaluationTimestamp to the timestamp of when the rule group was last evaluated. 
+func (g *PromRuleTask) setLastEvaluation(ts time.Time) { + g.mtx.Lock() + defer g.mtx.Unlock() + g.lastEvaluation = ts +} + +// EvalTimestamp returns the immediately preceding consistently slotted evaluation time. +func (g *PromRuleTask) EvalTimestamp(startTime int64) time.Time { + var ( + offset = int64(g.hash() % uint64(g.frequency)) + adjNow = startTime - offset + base = adjNow - (adjNow % int64(g.frequency)) + ) + + return time.Unix(0, base+offset).UTC() +} + +// CopyState copies the alerting rule and staleness related state from the given group. +// +// Rules are matched based on their name and labels. If there are duplicates, the +// first is matched with the first, second with the second etc. +func (g *PromRuleTask) CopyState(fromTask Task) error { + + from, ok := fromTask.(*PromRuleTask) + if !ok { + return fmt.Errorf("you can only copy rule groups with same type") + } + + g.evaluationTime = from.evaluationTime + g.lastEvaluation = from.lastEvaluation + + ruleMap := make(map[string][]int, len(from.rules)) + + for fi, fromRule := range from.rules { + nameAndLabels := nameAndLabels(fromRule) + l := ruleMap[nameAndLabels] + ruleMap[nameAndLabels] = append(l, fi) + } + + for i, rule := range g.rules { + nameAndLabels := nameAndLabels(rule) + indexes := ruleMap[nameAndLabels] + if len(indexes) == 0 { + continue + } + fi := indexes[0] + g.seriesInPreviousEval[i] = from.seriesInPreviousEval[fi] + ruleMap[nameAndLabels] = indexes[1:] + + ar, ok := rule.(*ThresholdRule) + if !ok { + continue + } + far, ok := from.rules[fi].(*ThresholdRule) + if !ok { + continue + } + + for fp, a := range far.active { + ar.active[fp] = a + } + } + + // Handle deleted and unmatched duplicate rules. + g.staleSeries = from.staleSeries + for fi, fromRule := range from.rules { + nameAndLabels := nameAndLabels(fromRule) + l := ruleMap[nameAndLabels] + if len(l) != 0 { + for _, series := range from.seriesInPreviousEval[fi] { + g.staleSeries = append(g.staleSeries, series) + } + } + } + return nil +} + +// Eval runs a single evaluation cycle in which all rules are evaluated sequentially. +func (g *PromRuleTask) Eval(ctx context.Context, ts time.Time) { + zap.S().Info("promql rule task:", g.name, "\t eval started at:", ts) + var samplesTotal float64 + for i, rule := range g.rules { + if rule == nil { + continue + } + select { + case <-g.done: + return + default: + } + + func(i int, rule Rule) { + sp, ctx := opentracing.StartSpanFromContext(ctx, "rule") + + sp.SetTag("name", rule.Name()) + defer func(t time.Time) { + sp.Finish() + + since := time.Since(t) + rule.SetEvaluationDuration(since) + rule.SetEvaluationTimestamp(t) + }(time.Now()) + + data, err := rule.Eval(ctx, ts, g.opts.Queriers) + if err != nil { + rule.SetHealth(HealthBad) + rule.SetLastError(err) + + zap.S().Warn("msg", "Evaluating rule failed", "rule", rule, "err", err) + + // Canceled queries are intentional termination of queries. This normally + // happens on shutdown and thus we skip logging of any errors here. + //! 
if _, ok := err.(promql.ErrQueryCanceled); !ok { + // level.Warn(g.logger).Log("msg", "Evaluating rule failed", "rule", rule, "err", err) + //} + return + } + vector := data.(pql.Vector) + samplesTotal += float64(len(vector)) + + rule.SendAlerts(ctx, ts, g.opts.ResendDelay, g.frequency, g.notify) + + seriesReturned := make(map[string]plabels.Labels, len(g.seriesInPreviousEval[i])) + + defer func() { + g.seriesInPreviousEval[i] = seriesReturned + }() + + for _, s := range vector { + seriesReturned[s.Metric.String()] = s.Metric + } + + }(i, rule) + } +} diff --git a/pkg/query-service/rules/queriers.go b/pkg/query-service/rules/queriers.go new file mode 100644 index 0000000000..c2444cff7a --- /dev/null +++ b/pkg/query-service/rules/queriers.go @@ -0,0 +1,21 @@ +package rules + +import ( + "github.com/ClickHouse/clickhouse-go/v2" + pqle "go.signoz.io/query-service/pqlEngine" +) + +// Queriers register the options for querying metrics or event sources +// which return a condition that results in a alert. Currently we support +// promql engine and clickhouse queries but in future we may include +// api readers for Machine Learning (ML) use cases. +// Note: each rule will pick up the querier it is interested in +// and use it. This allows rules to have flexibility in choosing +// the query engines. +type Queriers struct { + // promql engine + PqlEngine *pqle.PqlEngine + + // metric querier + Ch clickhouse.Conn +} diff --git a/pkg/query-service/rules/resultTypes.go b/pkg/query-service/rules/resultTypes.go new file mode 100644 index 0000000000..9a36a9759f --- /dev/null +++ b/pkg/query-service/rules/resultTypes.go @@ -0,0 +1,50 @@ +package rules + +import ( + "encoding/json" + "fmt" + "strconv" + + "go.signoz.io/query-service/utils/labels" +) + +// common result format of query + +type Vector []Sample + +type Sample struct { + Point + + Metric labels.Labels +} + +func (s Sample) String() string { + return fmt.Sprintf("%s => %s", s.Metric, s.Point) +} + +func (s Sample) MarshalJSON() ([]byte, error) { + v := struct { + M labels.Labels `json:"metric"` + V Point `json:"value"` + }{ + M: s.Metric, + V: s.Point, + } + return json.Marshal(v) +} + +type Point struct { + T int64 + V float64 +} + +func (p Point) String() string { + v := strconv.FormatFloat(p.V, 'f', -1, 64) + return fmt.Sprintf("%v @[%v]", v, p.T) +} + +// MarshalJSON implements json.Marshaler. +func (p Point) MarshalJSON() ([]byte, error) { + v := strconv.FormatFloat(p.V, 'f', -1, 64) + return json.Marshal([...]interface{}{float64(p.T) / 1000, v}) +} diff --git a/pkg/query-service/rules/rule.go b/pkg/query-service/rules/rule.go new file mode 100644 index 0000000000..ba5c934172 --- /dev/null +++ b/pkg/query-service/rules/rule.go @@ -0,0 +1,35 @@ +package rules + +import ( + "context" + "go.signoz.io/query-service/utils/labels" + "time" +) + +// A Rule encapsulates a vector expression which is evaluated at a specified +// interval and acted upon (currently used for alerting). 
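+//
+// Eval returns interface{} because implementations differ: ThresholdRule
+// yields the internal Vector type while PromRule yields a promql vector,
+// and the owning task asserts the concrete type back. A rough sketch of
+// the loop each task runs per rule:
+//
+//	data, err := rule.Eval(ctx, ts, queriers)
+//	if err != nil {
+//		rule.SetHealth(HealthBad)
+//		rule.SetLastError(err)
+//		return
+//	}
+//	rule.SendAlerts(ctx, ts, resendDelay, interval, notifyFunc)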
+type Rule interface { + ID() string + Name() string + Type() RuleType + + Labels() labels.BaseLabels + Annotations() labels.BaseLabels + Condition() *RuleCondition + State() AlertState + ActiveAlerts() []*Alert + + Eval(context.Context, time.Time, *Queriers) (interface{}, error) + String() string + // Query() string + SetLastError(error) + LastError() error + SetHealth(RuleHealth) + Health() RuleHealth + SetEvaluationDuration(time.Duration) + GetEvaluationDuration() time.Duration + SetEvaluationTimestamp(time.Time) + GetEvaluationTimestamp() time.Time + + SendAlerts(ctx context.Context, ts time.Time, resendDelay time.Duration, interval time.Duration, notifyFunc NotifyFunc) +} diff --git a/pkg/query-service/rules/ruleTask.go b/pkg/query-service/rules/ruleTask.go new file mode 100644 index 0000000000..59b25f05e0 --- /dev/null +++ b/pkg/query-service/rules/ruleTask.go @@ -0,0 +1,385 @@ +package rules + +import ( + "context" + "fmt" + opentracing "github.com/opentracing/opentracing-go" + "go.signoz.io/query-service/utils/labels" + "go.uber.org/zap" + "sort" + "sync" + "time" +) + +// RuleTask holds a rule (with composite queries) +// and evaluates the rule at a given frequency +type RuleTask struct { + name string + file string + frequency time.Duration + rules []Rule + seriesInPreviousEval []map[string]labels.Labels // One per Rule. + staleSeries []labels.Labels + opts *ManagerOptions + mtx sync.Mutex + evaluationDuration time.Duration + evaluationTime time.Duration + lastEvaluation time.Time + + markStale bool + done chan struct{} + terminated chan struct{} + managerDone chan struct{} + + pause bool + notify NotifyFunc +} + +const DefaultFrequency = 1 * time.Minute + +// newRuleTask makes a new RuleTask with the given name, options, and rules. +func newRuleTask(name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc) *RuleTask { + + if time.Now() == time.Now().Add(frequency) { + frequency = DefaultFrequency + } + zap.S().Info("msg:", "initiating a new rule task", "\t name:", name, "\t frequency:", frequency) + + return &RuleTask{ + name: name, + file: file, + pause: false, + frequency: frequency, + rules: rules, + opts: opts, + seriesInPreviousEval: make([]map[string]labels.Labels, len(rules)), + done: make(chan struct{}), + terminated: make(chan struct{}), + notify: notify, + } +} + +// Name returns the group name. +func (g *RuleTask) Name() string { return g.name } + +// Key returns the group key +func (g *RuleTask) Key() string { + return g.name + ";" + g.file +} + +// Name returns the group name. +func (g *RuleTask) Type() TaskType { return TaskTypeCh } + +// Rules returns the group's rules. +func (g *RuleTask) Rules() []Rule { return g.rules } + +// Interval returns the group's interval. +func (g *RuleTask) Interval() time.Duration { return g.frequency } + +func (g *RuleTask) Pause(b bool) { + g.mtx.Lock() + defer g.mtx.Unlock() + g.pause = b +} + +type QueryOrigin struct{} + +func NewQueryOriginContext(ctx context.Context, data map[string]interface{}) context.Context { + return context.WithValue(ctx, QueryOrigin{}, data) +} + +func (g *RuleTask) Run(ctx context.Context) { + defer close(g.terminated) + + // Wait an initial amount to have consistently slotted intervals. 
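+	// Note: unlike PromRuleTask, this loop honours the pause flag; iter()
+	// below returns early whenever Pause(true) has been called on the task.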
+ evalTimestamp := g.EvalTimestamp(time.Now().UnixNano()).Add(g.frequency) + zap.S().Debugf("group:", g.name, "\t group run to begin at: ", evalTimestamp) + select { + case <-time.After(time.Until(evalTimestamp)): + case <-g.done: + return + } + + ctx = NewQueryOriginContext(ctx, map[string]interface{}{ + "ruleRuleTask": map[string]string{ + "name": g.Name(), + }, + }) + + iter := func() { + if g.pause { + // todo(amol): remove in memory active alerts + // and last series state + return + } + start := time.Now() + g.Eval(ctx, evalTimestamp) + timeSinceStart := time.Since(start) + + g.setEvaluationTime(timeSinceStart) + g.setLastEvaluation(start) + } + + // The assumption here is that since the ticker was started after having + // waited for `evalTimestamp` to pass, the ticks will trigger soon + // after each `evalTimestamp + N * g.frequency` occurrence. + tick := time.NewTicker(g.frequency) + defer tick.Stop() + + // defer cleanup + defer func() { + if !g.markStale { + return + } + go func(now time.Time) { + for _, rule := range g.seriesInPreviousEval { + for _, r := range rule { + g.staleSeries = append(g.staleSeries, r) + } + } + // That can be garbage collected at this point. + g.seriesInPreviousEval = nil + + }(time.Now()) + + }() + + iter() + + // let the group iterate and run + for { + select { + case <-g.done: + return + default: + select { + case <-g.done: + return + case <-tick.C: + missed := (time.Since(evalTimestamp) / g.frequency) - 1 + evalTimestamp = evalTimestamp.Add((missed + 1) * g.frequency) + iter() + } + } + } +} + +func (g *RuleTask) Stop() { + close(g.done) + <-g.terminated +} + +func (g *RuleTask) hash() uint64 { + l := labels.New( + labels.Label{Name: "name", Value: g.name}, + ) + return l.Hash() +} + +// ThresholdRules returns the list of the group's threshold rules. +func (g *RuleTask) ThresholdRules() []*ThresholdRule { + g.mtx.Lock() + defer g.mtx.Unlock() + var alerts []*ThresholdRule + for _, rule := range g.rules { + if tr, ok := rule.(*ThresholdRule); ok { + alerts = append(alerts, tr) + } + } + sort.Slice(alerts, func(i, j int) bool { + return alerts[i].State() > alerts[j].State() || + (alerts[i].State() == alerts[j].State() && + alerts[i].Name() < alerts[j].Name()) + }) + return alerts +} + +// HasAlertingRules returns true if the group contains at least one AlertingRule. +func (g *RuleTask) HasAlertingRules() bool { + g.mtx.Lock() + defer g.mtx.Unlock() + + for _, rule := range g.rules { + if _, ok := rule.(*ThresholdRule); ok { + return true + } + } + return false +} + +// GetEvaluationDuration returns the time in seconds it took to evaluate the rule group. +func (g *RuleTask) GetEvaluationDuration() time.Duration { + g.mtx.Lock() + defer g.mtx.Unlock() + return g.evaluationDuration +} + +// SetEvaluationDuration sets the time in seconds the last evaluation took. +func (g *RuleTask) SetEvaluationDuration(dur time.Duration) { + g.mtx.Lock() + defer g.mtx.Unlock() + g.evaluationDuration = dur +} + +// GetEvaluationTime returns the time in seconds it took to evaluate the rule group. +func (g *RuleTask) GetEvaluationTime() time.Duration { + g.mtx.Lock() + defer g.mtx.Unlock() + return g.evaluationTime +} + +// setEvaluationTime sets the time in seconds the last evaluation took. +func (g *RuleTask) setEvaluationTime(dur time.Duration) { + g.mtx.Lock() + defer g.mtx.Unlock() + g.evaluationTime = dur +} + +// GetLastEvaluation returns the time the last evaluation of the rule group took place. 
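+// The lock matters: timings are written by the task's own Run loop and may
+// be read concurrently (e.g. through Manager.RuleTasks), so all of these
+// accessors take g.mtx.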
+func (g *RuleTask) GetLastEvaluation() time.Time { + g.mtx.Lock() + defer g.mtx.Unlock() + return g.lastEvaluation +} + +// setLastEvaluation updates evaluationTimestamp to the timestamp of when the rule group was last evaluated. +func (g *RuleTask) setLastEvaluation(ts time.Time) { + g.mtx.Lock() + defer g.mtx.Unlock() + g.lastEvaluation = ts +} + +// EvalTimestamp returns the immediately preceding consistently slotted evaluation time. +func (g *RuleTask) EvalTimestamp(startTime int64) time.Time { + var ( + offset = int64(g.hash() % uint64(g.frequency)) + adjNow = startTime - offset + base = adjNow - (adjNow % int64(g.frequency)) + ) + + return time.Unix(0, base+offset).UTC() +} + +func nameAndLabels(rule Rule) string { + return rule.Name() + rule.Labels().String() +} + +// CopyState copies the alerting rule and staleness related state from the given group. +// +// Rules are matched based on their name and labels. If there are duplicates, the +// first is matched with the first, second with the second etc. +func (g *RuleTask) CopyState(fromTask Task) error { + + from, ok := fromTask.(*RuleTask) + if !ok { + return fmt.Errorf("invalid from task for copy") + } + g.evaluationTime = from.evaluationTime + g.lastEvaluation = from.lastEvaluation + + ruleMap := make(map[string][]int, len(from.rules)) + + for fi, fromRule := range from.rules { + nameAndLabels := nameAndLabels(fromRule) + l := ruleMap[nameAndLabels] + ruleMap[nameAndLabels] = append(l, fi) + } + + for i, rule := range g.rules { + nameAndLabels := nameAndLabels(rule) + indexes := ruleMap[nameAndLabels] + if len(indexes) == 0 { + continue + } + fi := indexes[0] + g.seriesInPreviousEval[i] = from.seriesInPreviousEval[fi] + ruleMap[nameAndLabels] = indexes[1:] + + // todo(amol): support other rules too here + ar, ok := rule.(*ThresholdRule) + if !ok { + continue + } + far, ok := from.rules[fi].(*ThresholdRule) + if !ok { + continue + } + + for fp, a := range far.active { + ar.active[fp] = a + } + } + + // Handle deleted and unmatched duplicate rules. + // todo(amol): possibly not needed any more + g.staleSeries = from.staleSeries + for fi, fromRule := range from.rules { + nameAndLabels := nameAndLabels(fromRule) + l := ruleMap[nameAndLabels] + if len(l) != 0 { + for _, series := range from.seriesInPreviousEval[fi] { + g.staleSeries = append(g.staleSeries, series) + } + } + } + return nil +} + +// Eval runs a single evaluation cycle in which all rules are evaluated sequentially. +func (g *RuleTask) Eval(ctx context.Context, ts time.Time) { + + zap.S().Debugf("msg:", "rule task eval started", "\t name:", g.name, "\t start time:", ts) + + var samplesTotal float64 + for i, rule := range g.rules { + if rule == nil { + continue + } + select { + case <-g.done: + return + default: + } + + func(i int, rule Rule) { + sp, ctx := opentracing.StartSpanFromContext(ctx, "rule") + + sp.SetTag("name", rule.Name()) + defer func(t time.Time) { + sp.Finish() + + since := time.Since(t) + rule.SetEvaluationDuration(since) + rule.SetEvaluationTimestamp(t) + }(time.Now()) + + data, err := rule.Eval(ctx, ts, g.opts.Queriers) + if err != nil { + rule.SetHealth(HealthBad) + rule.SetLastError(err) + + zap.S().Warn("msg:", "Evaluating rule failed", "\t rule:", rule, "\t err: ", err) + + // Canceled queries are intentional termination of queries. This normally + // happens on shutdown and thus we skip logging of any errors here. + //! 
if _, ok := err.(promql.ErrQueryCanceled); !ok { + // level.Warn(g.logger).Log("msg", "Evaluating rule failed", "rule", rule, "err", err) + //} + return + } + + vector := data.(Vector) + samplesTotal += float64(len(vector)) + + rule.SendAlerts(ctx, ts, g.opts.ResendDelay, g.frequency, g.notify) + + seriesReturned := make(map[string]labels.Labels, len(g.seriesInPreviousEval[i])) + + for _, s := range vector { + seriesReturned[s.Metric.String()] = s.Metric + } + + g.seriesInPreviousEval[i] = seriesReturned + }(i, rule) + } +} diff --git a/pkg/query-service/rules/task.go b/pkg/query-service/rules/task.go new file mode 100644 index 0000000000..bec4ff1c13 --- /dev/null +++ b/pkg/query-service/rules/task.go @@ -0,0 +1,37 @@ +package rules + +import ( + "context" + "time" +) + +type TaskType string + +const ( + TaskTypeProm = "promql_ruletask" + TaskTypeCh = "ch_ruletask" +) + +type Task interface { + Name() string + + // Key returns the group key + Key() string + + Type() TaskType + CopyState(from Task) error + Eval(ctx context.Context, ts time.Time) + Run(ctx context.Context) + Rules() []Rule + Stop() + Pause(b bool) +} + +// newTask returns an appropriate group for +// rule type +func newTask(taskType TaskType, name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc) Task { + if taskType == TaskTypeCh { + return newRuleTask(name, file, frequency, rules, opts, notify) + } + return newPromRuleTask(name, file, frequency, rules, opts, notify) +} diff --git a/pkg/query-service/rules/templates.go b/pkg/query-service/rules/templates.go new file mode 100644 index 0000000000..4789780ffc --- /dev/null +++ b/pkg/query-service/rules/templates.go @@ -0,0 +1,290 @@ +package rules + +import ( + "bytes" + "context" + "errors" + "fmt" + "math" + "net/url" + "regexp" + "sort" + "strings" + + html_template "html/template" + text_template "text/template" + + "go.signoz.io/query-service/utils/times" +) + +type tmplQueryRecord struct { + Labels map[string]string + Value float64 +} +type tmplQueryResults []*tmplQueryRecord + +type tmplQueryResultsByLabelSorter struct { + results tmplQueryResults + by string +} + +func (q tmplQueryResultsByLabelSorter) Len() int { + return len(q.results) +} + +func (q tmplQueryResultsByLabelSorter) Less(i, j int) bool { + return q.results[i].Labels[q.by] < q.results[j].Labels[q.by] +} + +func (q tmplQueryResultsByLabelSorter) Swap(i, j int) { + q.results[i], q.results[j] = q.results[j], q.results[i] +} + +// Expander executes templates in text or HTML mode with a common set of Prometheus template functions. +type TemplateExpander struct { + text string + name string + data interface{} + funcMap text_template.FuncMap +} + +// NewTemplateExpander returns a template expander ready to use. 
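+//
+// The function map mirrors Prometheus's template helpers (humanize,
+// reReplaceAll, sortByLabel, ...). Combined with the $labels/$value
+// variables injected at eval time, a rule annotation could look like this
+// (hypothetical example):
+//
+//	high latency on {{$labels.service_name}}: {{$value | humanize}}s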
+func NewTemplateExpander( + ctx context.Context, + text string, + name string, + data interface{}, + timestamp times.Time, + externalURL *url.URL, +) *TemplateExpander { + return &TemplateExpander{ + text: text, + name: name, + data: data, + funcMap: text_template.FuncMap{ + "first": func(v tmplQueryResults) (*tmplQueryRecord, error) { + if len(v) > 0 { + return v[0], nil + } + return nil, errors.New("first() called on vector with no elements") + }, + "label": func(label string, s *tmplQueryRecord) string { + return s.Labels[label] + }, + "value": func(s *tmplQueryRecord) float64 { + return s.Value + }, + "strvalue": func(s *tmplQueryRecord) string { + return s.Labels["__value__"] + }, + "args": func(args ...interface{}) map[string]interface{} { + result := make(map[string]interface{}) + for i, a := range args { + result[fmt.Sprintf("arg%d", i)] = a + } + return result + }, + "reReplaceAll": func(pattern, repl, text string) string { + re := regexp.MustCompile(pattern) + return re.ReplaceAllString(text, repl) + }, + "safeHtml": func(text string) html_template.HTML { + return html_template.HTML(text) + }, + "match": regexp.MatchString, + "title": strings.Title, + "toUpper": strings.ToUpper, + "toLower": strings.ToLower, + "sortByLabel": func(label string, v tmplQueryResults) tmplQueryResults { + sorter := tmplQueryResultsByLabelSorter{v[:], label} + sort.Stable(sorter) + return v + }, + "humanize": func(v float64) string { + if v == 0 || math.IsNaN(v) || math.IsInf(v, 0) { + return fmt.Sprintf("%.4g", v) + } + if math.Abs(v) >= 1 { + prefix := "" + for _, p := range []string{"k", "M", "G", "T", "P", "E", "Z", "Y"} { + if math.Abs(v) < 1000 { + break + } + prefix = p + v /= 1000 + } + return fmt.Sprintf("%.4g%s", v, prefix) + } + prefix := "" + for _, p := range []string{"m", "u", "n", "p", "f", "a", "z", "y"} { + if math.Abs(v) >= 1 { + break + } + prefix = p + v *= 1000 + } + return fmt.Sprintf("%.4g%s", v, prefix) + }, + "humanize1024": func(v float64) string { + if math.Abs(v) <= 1 || math.IsNaN(v) || math.IsInf(v, 0) { + return fmt.Sprintf("%.4g", v) + } + prefix := "" + for _, p := range []string{"ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"} { + if math.Abs(v) < 1024 { + break + } + prefix = p + v /= 1024 + } + return fmt.Sprintf("%.4g%s", v, prefix) + }, + "humanizeDuration": func(v float64) string { + if math.IsNaN(v) || math.IsInf(v, 0) { + return fmt.Sprintf("%.4g", v) + } + if v == 0 { + return fmt.Sprintf("%.4gs", v) + } + if math.Abs(v) >= 1 { + sign := "" + if v < 0 { + sign = "-" + v = -v + } + seconds := int64(v) % 60 + minutes := (int64(v) / 60) % 60 + hours := (int64(v) / 60 / 60) % 24 + days := (int64(v) / 60 / 60 / 24) + // For days to minutes, we display seconds as an integer. + if days != 0 { + return fmt.Sprintf("%s%dd %dh %dm %ds", sign, days, hours, minutes, seconds) + } + if hours != 0 { + return fmt.Sprintf("%s%dh %dm %ds", sign, hours, minutes, seconds) + } + if minutes != 0 { + return fmt.Sprintf("%s%dm %ds", sign, minutes, seconds) + } + // For seconds, we display 4 significant digts. 
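+				// e.g. humanizeDuration(42.5) -> "42.5s"; sub-second values
+				// fall through to the SI-prefix branch below (0.5 -> "500ms")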
+ return fmt.Sprintf("%s%.4gs", sign, v) + } + prefix := "" + for _, p := range []string{"m", "u", "n", "p", "f", "a", "z", "y"} { + if math.Abs(v) >= 1 { + break + } + prefix = p + v *= 1000 + } + return fmt.Sprintf("%.4g%ss", v, prefix) + }, + "humanizeTimestamp": func(v float64) string { + if math.IsNaN(v) || math.IsInf(v, 0) { + return fmt.Sprintf("%.4g", v) + } + t := times.TimeFromUnixNano(int64(v * 1e9)).Time().UTC() + return fmt.Sprint(t) + }, + "pathPrefix": func() string { + return externalURL.Path + }, + "externalURL": func() string { + return externalURL.String() + }, + }, + } +} + +// AlertTemplateData returns the interface to be used in expanding the template. +func AlertTemplateData(labels map[string]string, value float64) interface{} { + return struct { + Labels map[string]string + Value float64 + }{ + Labels: labels, + Value: value, + } +} + +// Funcs adds the functions in fm to the Expander's function map. +// Existing functions will be overwritten in case of conflict. +func (te TemplateExpander) Funcs(fm text_template.FuncMap) { + for k, v := range fm { + te.funcMap[k] = v + } +} + +// Expand expands a template in text (non-HTML) mode. +func (te TemplateExpander) Expand() (result string, resultErr error) { + // It'd better to have no alert description than to kill the whole process + // if there's a bug in the template. + defer func() { + if r := recover(); r != nil { + var ok bool + resultErr, ok = r.(error) + if !ok { + resultErr = fmt.Errorf("panic expanding template %v: %v", te.name, r) + } + } + }() + + tmpl, err := text_template.New(te.name).Funcs(te.funcMap).Option("missingkey=zero").Parse(te.text) + if err != nil { + return "", fmt.Errorf("error parsing template %v: %v", te.name, err) + } + var buffer bytes.Buffer + err = tmpl.Execute(&buffer, te.data) + if err != nil { + return "", fmt.Errorf("error executing template %v: %v", te.name, err) + } + return buffer.String(), nil +} + +// ExpandHTML expands a template with HTML escaping, with templates read from the given files. +func (te TemplateExpander) ExpandHTML(templateFiles []string) (result string, resultErr error) { + defer func() { + if r := recover(); r != nil { + var ok bool + resultErr, ok = r.(error) + if !ok { + resultErr = fmt.Errorf("panic expanding template %v: %v", te.name, r) + } + } + }() + + tmpl := html_template.New(te.name).Funcs(html_template.FuncMap(te.funcMap)) + tmpl.Option("missingkey=zero") + tmpl.Funcs(html_template.FuncMap{ + "tmpl": func(name string, data interface{}) (html_template.HTML, error) { + var buffer bytes.Buffer + err := tmpl.ExecuteTemplate(&buffer, name, data) + return html_template.HTML(buffer.String()), err + }, + }) + tmpl, err := tmpl.Parse(te.text) + if err != nil { + return "", fmt.Errorf("error parsing template %v: %v", te.name, err) + } + if len(templateFiles) > 0 { + _, err = tmpl.ParseFiles(templateFiles...) + if err != nil { + return "", fmt.Errorf("error parsing template files for %v: %v", te.name, err) + } + } + var buffer bytes.Buffer + err = tmpl.Execute(&buffer, te.data) + if err != nil { + return "", fmt.Errorf("error executing template %v: %v", te.name, err) + } + return buffer.String(), nil +} + +// ParseTest parses the templates and returns the error if any. 
+func (te TemplateExpander) ParseTest() error { + _, err := text_template.New(te.name).Funcs(te.funcMap).Option("missingkey=zero").Parse(te.text) + if err != nil { + return err + } + return nil +} diff --git a/pkg/query-service/rules/thresholdRule.go b/pkg/query-service/rules/thresholdRule.go new file mode 100644 index 0000000000..8f734c113d --- /dev/null +++ b/pkg/query-service/rules/thresholdRule.go @@ -0,0 +1,679 @@ +package rules + +import ( + "context" + "fmt" + "go.uber.org/zap" + "math" + "reflect" + "sort" + "sync" + "time" + + "github.com/ClickHouse/clickhouse-go/v2" + "go.signoz.io/query-service/app/metrics" + "go.signoz.io/query-service/constants" + qsmodel "go.signoz.io/query-service/model" + "go.signoz.io/query-service/utils/labels" + "go.signoz.io/query-service/utils/times" + "go.signoz.io/query-service/utils/timestamp" + "go.signoz.io/query-service/utils/value" + + yaml "gopkg.in/yaml.v2" +) + +type ThresholdRule struct { + id string + name string + source string + ruleCondition *RuleCondition + evalWindow time.Duration + holdDuration time.Duration + labels labels.Labels + annotations labels.Labels + + mtx sync.Mutex + evaluationDuration time.Duration + evaluationTimestamp time.Time + + health RuleHealth + + lastError error + + // map of active alerts + active map[uint64]*Alert +} + +func NewThresholdRule( + id string, + name string, + ruleCondition *RuleCondition, + evalWindow time.Duration, + l, a map[string]string, + source string, +) (*ThresholdRule, error) { + + if int64(evalWindow) == 0 { + evalWindow = 5 * time.Minute + } + + if ruleCondition == nil { + return nil, fmt.Errorf("no rule condition") + } else if !ruleCondition.IsValid() { + return nil, fmt.Errorf("invalid rule condition") + } + + zap.S().Info("msg:", "creating new alerting rule", "\t name:", name, "\t condition:", ruleCondition.String()) + + return &ThresholdRule{ + id: id, + name: name, + source: source, + ruleCondition: ruleCondition, + evalWindow: evalWindow, + labels: labels.FromMap(l), + annotations: labels.FromMap(a), + + health: HealthUnknown, + active: map[uint64]*Alert{}, + }, nil +} + +func (r *ThresholdRule) Name() string { + return r.name +} + +func (r *ThresholdRule) ID() string { + return r.id +} + +func (r *ThresholdRule) Condition() *RuleCondition { + return r.ruleCondition +} + +func (r *ThresholdRule) GeneratorURL() string { + return r.source +} + +func (r *ThresholdRule) target() *float64 { + if r.ruleCondition == nil { + return nil + } + return r.ruleCondition.Target +} + +func (r *ThresholdRule) matchType() MatchType { + if r.ruleCondition == nil { + return AtleastOnce + } + return r.ruleCondition.MatchType +} + +func (r *ThresholdRule) compareOp() CompareOp { + if r.ruleCondition == nil { + return ValueIsEq + } + return r.ruleCondition.CompareOp +} + +func (r *ThresholdRule) Type() RuleType { + return RuleTypeThreshold +} + +func (r *ThresholdRule) SetLastError(err error) { + r.mtx.Lock() + defer r.mtx.Unlock() + r.lastError = err +} + +func (r *ThresholdRule) LastError() error { + r.mtx.Lock() + defer r.mtx.Unlock() + return r.lastError +} + +func (r *ThresholdRule) SetHealth(health RuleHealth) { + r.mtx.Lock() + defer r.mtx.Unlock() + r.health = health +} + +func (r *ThresholdRule) Health() RuleHealth { + r.mtx.Lock() + defer r.mtx.Unlock() + return r.health +} + +// SetEvaluationDuration updates evaluationDuration to the duration it took to evaluate the rule on its last evaluation. 
+func (r *ThresholdRule) SetEvaluationDuration(dur time.Duration) { + r.mtx.Lock() + defer r.mtx.Unlock() + r.evaluationDuration = dur +} + +func (r *ThresholdRule) HoldDuration() time.Duration { + return r.holdDuration +} + +func (r *ThresholdRule) EvalWindow() time.Duration { + return r.evalWindow +} + +// Labels returns the labels of the alerting rule. +func (r *ThresholdRule) Labels() labels.BaseLabels { + return r.labels +} + +// Annotations returns the annotations of the alerting rule. +func (r *ThresholdRule) Annotations() labels.BaseLabels { + return r.annotations +} + +func (r *ThresholdRule) sample(alert *Alert, ts time.Time) Sample { + lb := labels.NewBuilder(r.labels) + alertLabels := alert.Labels.(labels.Labels) + for _, l := range alertLabels { + lb.Set(l.Name, l.Value) + } + + lb.Set(labels.MetricNameLabel, alertMetricName) + lb.Set(labels.AlertNameLabel, r.name) + lb.Set(labels.AlertRuleIdLabel, r.ID()) + lb.Set(labels.AlertStateLabel, alert.State.String()) + + s := Sample{ + Metric: lb.Labels(), + Point: Point{T: timestamp.FromTime(ts), V: 1}, + } + return s +} + +// forStateSample returns the sample for ALERTS_FOR_STATE. +func (r *ThresholdRule) forStateSample(alert *Alert, ts time.Time, v float64) Sample { + lb := labels.NewBuilder(r.labels) + + alertLabels := alert.Labels.(labels.Labels) + for _, l := range alertLabels { + lb.Set(l.Name, l.Value) + } + + lb.Set(labels.MetricNameLabel, alertForStateMetricName) + lb.Set(labels.AlertNameLabel, r.name) + + s := Sample{ + Metric: lb.Labels(), + Point: Point{T: timestamp.FromTime(ts), V: v}, + } + return s +} + +// GetEvaluationDuration returns the time in seconds it took to evaluate the alerting rule. +func (r *ThresholdRule) GetEvaluationDuration() time.Duration { + r.mtx.Lock() + defer r.mtx.Unlock() + return r.evaluationDuration +} + +// SetEvaluationTimestamp updates evaluationTimestamp to the timestamp of when the rule was last evaluated. +func (r *ThresholdRule) SetEvaluationTimestamp(ts time.Time) { + r.mtx.Lock() + defer r.mtx.Unlock() + r.evaluationTimestamp = ts +} + +// GetEvaluationTimestamp returns the time the evaluation took place. +func (r *ThresholdRule) GetEvaluationTimestamp() time.Time { + r.mtx.Lock() + defer r.mtx.Unlock() + return r.evaluationTimestamp +} + +// State returns the maximum state of alert instances for this rule. +// StateFiring > StatePending > StateInactive +func (r *ThresholdRule) State() AlertState { + r.mtx.Lock() + defer r.mtx.Unlock() + + maxState := StateInactive + for _, a := range r.active { + if a.State > maxState { + maxState = a.State + } + } + return maxState +} + +func (r *ThresholdRule) currentAlerts() []*Alert { + r.mtx.Lock() + defer r.mtx.Unlock() + + alerts := make([]*Alert, 0, len(r.active)) + + for _, a := range r.active { + anew := *a + alerts = append(alerts, &anew) + } + return alerts +} + +func (r *ThresholdRule) ActiveAlerts() []*Alert { + var res []*Alert + for _, a := range r.currentAlerts() { + if a.ResolvedAt.IsZero() { + res = append(res, a) + } + } + return res +} + +// ForEachActiveAlert runs the given function on each alert. +// This should be used when you want to use the actual alerts from the ThresholdRule +// and not on its copy. +// If you want to run on a copy of alerts then don't use this, get the alerts from 'ActiveAlerts()'. 
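The comment above introduces `ForEachActiveAlert`, which visits the live alert map under the rule mutex; `SendAlerts` builds on it just below. Related is `State()`, defined earlier in this file, which folds that same map down to a single rule-level state with `StateFiring > StatePending > StateInactive`. A self-contained sketch of that fold, with stand-in names rather than the PR's types:

```go
package main

import "fmt"

// Stand-in alert states, ordered so a numeric max matches
// StateFiring > StatePending > StateInactive.
type alertState int

const (
	stateInactive alertState = iota
	statePending
	stateFiring
)

func (s alertState) String() string {
	return [...]string{"inactive", "pending", "firing"}[s]
}

// ruleState reduces per-alert states to the rule-level state, the same
// fold State() performs over r.active.
func ruleState(active map[uint64]alertState) alertState {
	maxState := stateInactive
	for _, s := range active {
		if s > maxState {
			maxState = s
		}
	}
	return maxState
}

func main() {
	active := map[uint64]alertState{1: statePending, 2: stateFiring, 3: stateInactive}
	fmt.Println(ruleState(active)) // firing
}
```

`currentAlerts()` serves the complementary case: callers that must not mutate the live alerts receive value copies instead.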
+func (r *ThresholdRule) ForEachActiveAlert(f func(*Alert)) {
+ r.mtx.Lock()
+ defer r.mtx.Unlock()
+
+ for _, a := range r.active {
+ f(a)
+ }
+}
+
+func (r *ThresholdRule) SendAlerts(ctx context.Context, ts time.Time, resendDelay time.Duration, interval time.Duration, notifyFunc NotifyFunc) {
+ zap.S().Info("msg:", "initiating send alerts (if any)", "\t rule:", r.Name())
+ alerts := []*Alert{}
+ r.ForEachActiveAlert(func(alert *Alert) {
+ if alert.needsSending(ts, resendDelay) {
+ alert.LastSentAt = ts
+ // Allow for two Eval or Alertmanager send failures.
+ delta := resendDelay
+ if interval > resendDelay {
+ delta = interval
+ }
+ alert.ValidUntil = ts.Add(4 * delta)
+ anew := *alert
+ alerts = append(alerts, &anew)
+ } else {
+ zap.S().Debugf("msg: skipping send alert due to resend delay", "\t rule: ", r.Name(), "\t alert:", alert.Labels)
+ }
+ })
+ notifyFunc(ctx, "", alerts...)
+}
+
+func (r *ThresholdRule) CheckCondition(v float64) bool {
+ if value.IsNaN(v) {
+ zap.S().Debugf("msg:", "found NaN in rule condition", "\t rule name:", r.Name())
+ return false
+ }
+
+ if r.ruleCondition.Target == nil {
+ zap.S().Debugf("msg:", "found null target in rule condition", "\t rule name:", r.Name())
+ return false
+ }
+
+ switch r.ruleCondition.CompareOp {
+ case ValueIsEq:
+ return v == *r.ruleCondition.Target
+ case ValueIsNotEq:
+ return v != *r.ruleCondition.Target
+ case ValueIsBelow:
+ return v < *r.ruleCondition.Target
+ case ValueIsAbove:
+ return v > *r.ruleCondition.Target
+ default:
+ return false
+ }
+}
+
+func (r *ThresholdRule) prepareQueryRange(ts time.Time) *qsmodel.QueryRangeParamsV2 {
+ // todo(amol): add 30 seconds to evalWindow for rate calc
+ tsEnd := ts.UnixNano() / int64(time.Millisecond)
+ tsStart := ts.Add(-r.evalWindow).UnixNano() / int64(time.Millisecond)
+
+ // for k, v := range r.ruleCondition.CompositeMetricQuery.BuilderQueries {
+ //  v.ReduceTo = qsmodel.RMAX
+ //  r.ruleCondition.CompositeMetricQuery.BuilderQueries[k] = v
+ // }
+
+ return &qsmodel.QueryRangeParamsV2{
+ Start: tsStart,
+ End: tsEnd,
+ Step: 30,
+ CompositeMetricQuery: r.ruleCondition.CompositeMetricQuery,
+ }
+}
+
+// runChQuery runs the actual query against ClickHouse.
+func (r *ThresholdRule) runChQuery(ctx context.Context, db clickhouse.Conn, query string) (Vector, error) {
+ rows, err := db.Query(ctx, query)
+ if err != nil {
+ zap.S().Errorf("rule:", r.Name(), "\t failed to get alert query result")
+ return nil, err
+ }
+
+ columnTypes := rows.ColumnTypes()
+ columnNames := rows.Columns()
+ vars := make([]interface{}, len(columnTypes))
+
+ for i := range columnTypes {
+ vars[i] = reflect.New(columnTypes[i].ScanType()).Interface()
+ }
+
+ // []sample list
+ var result Vector
+
+ // map[fingerprint]sample
+ resultMap := make(map[uint64]Sample)
+
+ // For rates we want to skip the first record, but we don't know when
+ // rates are being used, so we always pick a timeframe with a 30-second
+ // interval and skip the first record for a given label combination.
+ skipFirstRecord := make(map[uint64]bool)
+
+ defer rows.Close()
+ for rows.Next() {
+
+ if err := rows.Scan(vars...); err != nil {
+ return nil, err
+ }
+
+ sample := Sample{}
+ lbls := labels.NewBuilder(labels.Labels{})
+
+ for i, v := range vars {
+
+ colName := columnNames[i]
+
+ switch v := v.(type) {
+ case *string:
+ lbls.Set(colName, *v)
+ case *time.Time:
+ timval := *v
+
+ if colName == "ts" {
+ sample.Point.T = timval.Unix()
+ } else {
+ lbls.Set(colName, timval.Format("2006-01-02 15:04:05"))
+ }
+
+ case *float64:
+ if colName == "res" || colName == "value" {
+ sample.Point.V = *v
+ } else {
+ lbls.Set(colName, fmt.Sprintf("%f", *v))
+ }
+ case *uint64:
+ intv := *v
+ if colName == "res" || colName == "value" {
+ sample.Point.V = float64(intv)
+ } else {
+ lbls.Set(colName, fmt.Sprintf("%d", intv))
+ }
+ case *uint8:
+ intv := *v
+ if colName == "res" || colName == "value" {
+ sample.Point.V = float64(intv)
+ } else {
+ lbls.Set(colName, fmt.Sprintf("%d", intv))
+ }
+ default:
+ zap.S().Errorf("ruleId:", r.ID(), "\t error: invalid var found in query result", v, columnNames[i])
+ }
+ }
+
+ if value.IsNaN(sample.Point.V) {
+ continue
+ }
+
+ // capture labels in result
+ sample.Metric = lbls.Labels()
+
+ labelHash := lbls.Labels().Hash()
+
+ // Here we walk through the values of the time series and calculate
+ // the final value used to compare against the rule target.
+ if existing, ok := resultMap[labelHash]; ok {
+
+ switch r.matchType() {
+ case AllTheTimes:
+ if r.compareOp() == ValueIsAbove {
+ sample.Point.V = math.Min(existing.Point.V, sample.Point.V)
+ resultMap[labelHash] = sample
+ } else if r.compareOp() == ValueIsBelow {
+ sample.Point.V = math.Max(existing.Point.V, sample.Point.V)
+ resultMap[labelHash] = sample
+ }
+ case AtleastOnce:
+ if r.compareOp() == ValueIsAbove {
+ sample.Point.V = math.Max(existing.Point.V, sample.Point.V)
+ resultMap[labelHash] = sample
+ } else if r.compareOp() == ValueIsBelow {
+ sample.Point.V = math.Min(existing.Point.V, sample.Point.V)
+ resultMap[labelHash] = sample
+ }
+ case OnAverage:
+ sample.Point.V = (existing.Point.V + sample.Point.V) / 2
+ resultMap[labelHash] = sample
+ case InTotal:
+ sample.Point.V = existing.Point.V + sample.Point.V
+ resultMap[labelHash] = sample
+ }
+
+ } else {
+ if skipFirstRecord[labelHash] {
+ resultMap[labelHash] = sample
+ } else {
+ // looks like the first record for this label combo, skip it
+ skipFirstRecord[labelHash] = true
+ }
+ }
+ }
+
+ for _, sample := range resultMap {
+ // check alert rule condition before dumping results
+ if r.CheckCondition(sample.Point.V) {
+ result = append(result, sample)
+ }
+ }
+
+ return result, nil
+}
+
+// buildAndRunQuery checks whether the alert condition is
+// satisfied and returns the resulting signals.
+func (r *ThresholdRule) buildAndRunQuery(ctx context.Context, ts time.Time, ch clickhouse.Conn) (Vector, error) {
+ params := r.prepareQueryRange(ts)
+
+ runQueries := metrics.PrepareBuilderMetricQueries(params, constants.SIGNOZ_TIMESERIES_TABLENAME)
+ if runQueries.Err != nil {
+ return nil, fmt.Errorf("failed to prepare metric queries: %v", runQueries.Err)
+ }
+
+ if len(runQueries.Queries) == 0 {
+ return nil, fmt.Errorf("no queries could be built with the rule config")
+ }
+
+ zap.S().Debugf("ruleid:", r.ID(), "\t runQueries:", runQueries.Queries)
+
+ // find target query label
+ if query, ok := runQueries.Queries["F1"]; ok {
+ // found a formula query, run with it
+ return r.runChQuery(ctx, ch, query)
+ }
+
+ // no formula in rule condition, now look for the
+ // query label with the max ASCII value
+ keys := make([]string, 0, len(runQueries.Queries))
+ for k := range runQueries.Queries {
+ keys = append(keys, k)
+ }
+ sort.Strings(keys)
+
+ queryLabel := keys[len(keys)-1]
+
+ zap.S().Debugf("ruleId: ", r.ID(), "\t result query label:", queryLabel)
+
+ if queryString, ok := runQueries.Queries[queryLabel]; ok {
+ return r.runChQuery(ctx, ch, queryString)
+ }
+
+ zap.S().Errorf("ruleId: ", r.ID(), "\t invalid query label:", queryLabel, "\t queries:", runQueries.Queries)
+ return nil, fmt.Errorf("this is unexpected, invalid query label")
+}
+
+func (r *ThresholdRule) Eval(ctx context.Context, ts time.Time, queriers *Queriers) (interface{}, error) {
+ res, err := r.buildAndRunQuery(ctx, ts, queriers.Ch)
+ if err != nil {
+ r.SetHealth(HealthBad)
+ r.SetLastError(err)
+ zap.S().Debugf("ruleid:", r.ID(), "\t failure in buildAndRunQuery:", err)
+ return nil, err
+ }
+
+ r.mtx.Lock()
+ defer r.mtx.Unlock()
+
+ resultFPs := map[uint64]struct{}{}
+ var vec Vector
+ var alerts = make(map[uint64]*Alert, len(res))
+
+ for _, smpl := range res {
+ l := make(map[string]string, len(smpl.Metric))
+ for _, lbl := range smpl.Metric {
+ l[lbl.Name] = lbl.Value
+ }
+
+ tmplData := AlertTemplateData(l, smpl.V)
+ // Inject some convenience variables that are easier to remember for users
+ // who are not used to Go's templating system.
+ defs := "{{$labels := .Labels}}{{$value := .Value}}"
+
+ expand := func(text string) string {
+ tmpl := NewTemplateExpander(
+ ctx,
+ defs+text,
+ "__alert_"+r.Name(),
+ tmplData,
+ times.Time(timestamp.FromTime(ts)),
+ nil,
+ )
+ result, err := tmpl.Expand()
+ if err != nil {
+ result = fmt.Sprintf("<error expanding template: %s>", err)
+ zap.S().Errorf("msg:", "Expanding alert template failed", "\t err", err, "\t data", tmplData)
+ }
+ return result
+ }
+
+ lb := labels.NewBuilder(smpl.Metric).Del(labels.MetricNameLabel)
+
+ for _, l := range r.labels {
+ lb.Set(l.Name, expand(l.Value))
+ }
+
+ lb.Set(labels.AlertNameLabel, r.Name())
+ lb.Set(labels.AlertRuleIdLabel, r.ID())
+ lb.Set(labels.RuleSourceLabel, r.GeneratorURL())
+
+ annotations := make(labels.Labels, 0, len(r.annotations))
+ for _, a := range r.annotations {
+ annotations = append(annotations, labels.Label{Name: a.Name, Value: expand(a.Value)})
+ }
+
+ lbs := lb.Labels()
+ h := lbs.Hash()
+ resultFPs[h] = struct{}{}
+
+ if _, ok := alerts[h]; ok {
+ zap.S().Errorf("ruleId: ", r.ID(), "\t msg:", "the alert query returns duplicate records:", alerts[h])
+ err = fmt.Errorf("duplicate alert found, vector contains metrics with the same labelset after applying alert labels")
+ // We have already acquired the lock above hence using SetHealth and
+ // SetLastError will deadlock.
+ r.health = HealthBad
+ r.lastError = err
+ return nil, err
+ }
+
+ alerts[h] = &Alert{
+ Labels: lbs,
+ Annotations: annotations,
+ ActiveAt: ts,
+ State: StatePending,
+ Value: smpl.V,
+ GeneratorURL: r.GeneratorURL(),
+ }
+ }
+
+ zap.S().Info("rule:", r.Name(), "\t alerts found: ", len(alerts))
+
+ // alerts[h] is ready, add or update active list now
+ for h, a := range alerts {
+ // Check whether we already have alerting state for the identifying label set.
+ // Update the last value and annotations if so, create a new alert entry otherwise.
+ if alert, ok := r.active[h]; ok && alert.State != StateInactive {
+ alert.Value = a.Value
+ alert.Annotations = a.Annotations
+ continue
+ }
+
+ r.active[h] = a
+ }
+
+ // Check if any pending alerts should be removed or fire now. Write out alert timeseries.
+ for fp, a := range r.active {
+ if _, ok := resultFPs[fp]; !ok {
+ // If the alert was previously firing, keep it around for a given
+ // retention time so it is reported as resolved to the AlertManager.
+ if a.State == StatePending || (!a.ResolvedAt.IsZero() && ts.Sub(a.ResolvedAt) > resolvedRetention) { + delete(r.active, fp) + } + if a.State != StateInactive { + a.State = StateInactive + a.ResolvedAt = ts + } + continue + } + + if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration { + a.State = StateFiring + a.FiredAt = ts + } + + } + r.health = HealthGood + r.lastError = err + return vec, nil + +} + +func (r *ThresholdRule) String() string { + + ar := PostableRule{ + Alert: r.name, + RuleCondition: r.ruleCondition, + EvalWindow: Duration(r.evalWindow), + Labels: r.labels.Map(), + Annotations: r.annotations.Map(), + } + + byt, err := yaml.Marshal(ar) + if err != nil { + return fmt.Sprintf("error marshaling alerting rule: %s", err.Error()) + } + + return string(byt) +} diff --git a/pkg/query-service/tests/test-deploy/clickhouse-config.xml b/pkg/query-service/tests/test-deploy/clickhouse-config.xml index 3bb26a3a36..4a6a82b8af 100644 --- a/pkg/query-service/tests/test-deploy/clickhouse-config.xml +++ b/pkg/query-service/tests/test-deploy/clickhouse-config.xml @@ -22,7 +22,7 @@ [1]: https://github.com/pocoproject/poco/blob/poco-1.9.4-release/Foundation/include/Poco/Logger.h#L105-L114 --> - trace + information /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.err.log
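Stepping back to the reduction logic in `runChQuery` above: each label combination's series is folded into one value, and only that value is compared against the target by `CheckCondition`. Below is a self-contained sketch of the four reduction modes; all names are stand-ins, not this PR's types. One caveat it makes explicit: the PR's `OnAverage` branch folds pairwise as `(existing + new) / 2`, which weights later points more heavily, whereas the count-based fold shown here yields the true mean.

```go
package main

import (
	"fmt"
	"math"
)

// matchType mirrors the four reduction modes used in runChQuery.
type matchType int

const (
	allTheTimes matchType = iota // condition must hold at every point
	atleastOnce                  // condition must hold at one point
	onAverage                    // compare the mean of the series
	inTotal                      // compare the sum of the series
)

// reduce folds a non-empty series into the single value compared against the
// threshold. For allTheTimes/atleastOnce the weakest/strongest point is kept
// relative to an "above" comparison; onAverage divides by the point count,
// giving a true mean rather than the pairwise fold in the diff.
func reduce(points []float64, mt matchType, above bool) float64 {
	agg, total := points[0], points[0]
	for _, v := range points[1:] {
		switch mt {
		case allTheTimes:
			if above {
				agg = math.Min(agg, v) // must hold everywhere: track the minimum
			} else {
				agg = math.Max(agg, v)
			}
		case atleastOnce:
			if above {
				agg = math.Max(agg, v) // must hold once: track the maximum
			} else {
				agg = math.Min(agg, v)
			}
		case onAverage, inTotal:
			total += v
		}
	}
	switch mt {
	case onAverage:
		return total / float64(len(points))
	case inTotal:
		return total
	}
	return agg
}

func main() {
	series := []float64{10, 30, 20}
	fmt.Println(reduce(series, allTheTimes, true)) // 10: compare the weakest point
	fmt.Println(reduce(series, onAverage, true))   // 20: true mean of the series
}
```

The `skipFirstRecord` map complements this fold: for rate-style queries the first point of every label combination is dropped, so a partially covered first interval cannot trip the threshold on its own.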