Unverified commit 7d221443, authored by Silvano Cerza, committed by GitHub

[skip changelog] Update workflow and script to fetch Arduino CDN download data (#1476)

parent be520ef8
import boto3
import semver
import os
import logging
import uuid
import time
# To see debug output while developing, configure the root logger, e.g.
#   logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# (this needs `import sys`, which the script does not otherwise use).
log = logging.getLogger()

# Raise the threshold of the AWS SDK loggers and of the HTTP library logger so
# that only CRITICAL records from them get through.
for _quiet_logger_name in ("boto3", "botocore", "urllib3"):
    logging.getLogger(_quiet_logger_name).setLevel(logging.CRITICAL)
def execute(client, statement, dest_s3_output_location, database="etl_kpi_prod_hwfw"):
    """Start an Athena query and block until it has completed.

    Args:
        client: Athena client object; must provide ``start_query_execution``
            and whatever ``wait_for_query_execution_completion`` calls on it.
        statement: SQL text submitted as the query string.
        dest_s3_output_location: Value passed as the result configuration's
            ``OutputLocation``.
        database: Name placed in the query execution context. Defaults to the
            database this script has always queried.

    Returns:
        The ``QueryExecutionId`` reported by ``start_query_execution``.

    Raises:
        Whatever ``wait_for_query_execution_completion`` raises when the query
        does not succeed.
    """
    # Lazy %-style arguments: the message is only built if the record is emitted.
    log.info("execute query: %s dumping in %s", statement, dest_s3_output_location)
    response = client.start_query_execution(
        QueryString=statement,
        # A fresh random token on every call, so no two calls share a token.
        ClientRequestToken=str(uuid.uuid4()),
        QueryExecutionContext={"Database": database},
        ResultConfiguration={"OutputLocation": dest_s3_output_location},
    )
    execution_id = response["QueryExecutionId"]

    log.info("wait for query %s completion", execution_id)
    wait_for_query_execution_completion(client, execution_id)
    log.info("operation successful")
    return execution_id
def wait_for_query_execution_completion(client, query_execution_id, poll_interval=1, timeout=None):
    """Poll a query execution until it reaches a terminal state.

    Args:
        client: Athena client object providing ``get_query_execution``.
        query_execution_id: Identifier of the execution to watch.
        poll_interval: Seconds to sleep between two polls.
        timeout: Maximum number of seconds to wait, or ``None`` (the default)
            to wait indefinitely.

    Returns:
        None, once the state is ``SUCCEEDED``.

    Raises:
        RuntimeError: The state became ``FAILED`` or ``CANCELLED``. This is a
            regular ``Exception`` subclass, so ``except Exception`` handlers see
            it; handlers written for ``BaseException`` keep working.
        TimeoutError: ``timeout`` was given and elapsed before the query
            reached a terminal state.
    """
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        response = client.get_query_execution(QueryExecutionId=query_execution_id)
        status = response["QueryExecution"]["Status"]
        state = status["State"]

        if state == "SUCCEEDED":
            return
        if state in ("FAILED", "CANCELLED"):
            # The reason may be missing from the status, so do not index it blindly.
            reason = status.get("StateChangeReason", "no reason reported")
            raise RuntimeError("query failed or canceled: {}".format(reason))
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                "query {} still in state {} after {} seconds".format(query_execution_id, state, timeout)
            )

        time.sleep(poll_interval)
def valid(key):
    """Tell whether the part of *key* before its first underscore is a semver.

    Args:
        key: String such as ``0.18.0_macOS_64bit.tar.gz``.

    Returns:
        True when ``semver.parse`` accepts the leading component, False when it
        rejects it with ``ValueError``.
    """
    # Everything before the first "_" (the whole key when there is none).
    version_candidate = key.partition("_")[0]
    try:
        semver.parse(version_candidate)
    except ValueError:
        return False
    else:
        return True
def get_results(client, execution_id):
    """Collect the rows of a finished query into a ``{key: value}`` dict.

    The first column of each row is used as the key and the second as the
    value; rows whose key is rejected by ``valid`` are dropped.

    Args:
        client: Athena client object providing ``get_paginator``.
        execution_id: Identifier of the query execution to read.

    Returns:
        Dict mapping the first column's text to the second column's text.
    """
    paginator = client.get_paginator("get_query_results")
    pages = paginator.paginate(QueryExecutionId=execution_id, PaginationConfig={"PageSize": 1000})

    res = {}
    for page_number, page in enumerate(pages):
        rows = page["ResultSet"]["Rows"]
        if page_number == 0:
            # Only the very first row of the result set is the column-header
            # row. Skipping a row on every page would silently drop one data
            # row per additional page.
            rows = rows[1:]
        for row in rows:
            cells = row["Data"]
            # NOTE(review): a cell without "VarCharValue" is presumably a NULL
            # in the result; such a row has no usable key, so it is skipped.
            key = cells[0].get("VarCharValue")
            if key is None or not valid(key):
                continue
            res[key] = cells[1]["VarCharValue"]
    return res
def convert_data(data):
    """Turn ``{flavor: count}`` pairs into a list of gauge datapoint dicts.

    A flavor looks like ``0.18.0_macOS_64bit.tar.gz``: version, operating
    system and architecture-plus-extension joined by underscores. Keys that do
    not have exactly three underscore-separated parts, or whose architecture
    part is longer than 10 characters, are ignored.

    Args:
        data: Mapping of flavor string to its count value.

    Returns:
        List of dicts with the keys ``type``, ``name``, ``value``, ``host`` and
        ``tags``; one per accepted key, in the mapping's iteration order.

    Raises:
        KeyError: At least one key was accepted and the ``GITHUB_REPOSITORY``
            environment variable is not set.
        IndexError: At least one key was accepted and ``GITHUB_REPOSITORY``
            contains no ``/``.
    """
    accepted = []
    for key, value in data.items():
        # 0.18.0_macOS_64bit.tar.gz
        parts = key.split("_")
        if len(parts) != 3:
            continue
        version, os_version, archive = parts
        # Keep what precedes the first dot: "64bit.tar.gz" -> "64bit".
        arch = archive.split(".")[0]
        if len(arch) > 10:
            # This can't be an architecture really.
            # It's an ugly solution but works for now so deal with it.
            continue
        accepted.append((version, os_version, arch, value))

    if not accepted:
        # Nothing to emit, so the environment is never consulted.
        return []

    # Same for every datapoint: read and split the repository name once.
    host = os.environ["GITHUB_REPOSITORY"]
    repo = host.split("/")[1]

    return [
        {
            "type": "gauge",
            "name": "arduino.downloads.total",
            "value": value,
            "host": host,
            "tags": [
                f"version:{version}",
                f"os:{os_version}",
                f"arch:{arch}",
                "cdn:downloads.arduino.cc",
                f"project:{repo}",
            ],
        }
        for version, os_version, arch, value in accepted
    ]
if __name__ == "__main__":
    # Script entry point: run the download-count query on Athena and expose the
    # resulting datapoints as the "result" output of the workflow step.
    import json

    # Required configuration; a missing variable fails fast with KeyError.
    DEST_S3_OUTPUT = os.environ["AWS_ATHENA_OUTPUT_LOCATION"]
    AWS_ATHENA_SOURCE_TABLE = os.environ["AWS_ATHENA_SOURCE_TABLE"]

    session = boto3.session.Session(region_name="us-east-1")
    athena_client = session.client("athena")

    # Counts requests per "flavor", i.e. the download URL with the common
    # arduino-cli prefix stripped (for example 0.18.0_macOS_64bit.tar.gz).
    query = f"""SELECT replace(json_extract_scalar(url_decode(url_decode(querystring)),
'$.data.url'), 'https://downloads.arduino.cc/arduino-cli/arduino-cli_', '')
AS flavor, count(json_extract(url_decode(url_decode(querystring)),'$')) AS gauge
FROM {AWS_ATHENA_SOURCE_TABLE}
WHERE json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url')
LIKE 'https://downloads.arduino.cc/arduino-cli/arduino-cli_%'
AND json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url')
NOT LIKE '%latest%' -- exclude latest redirect
AND json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url')
NOT LIKE '%alpha%' -- exclude early alpha releases
AND json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url')
NOT LIKE '%.tar.bz2%' -- exclude very old releases archive formats
group by 1 ;"""

    exec_id = execute(athena_client, query, DEST_S3_OUTPUT)
    results = get_results(athena_client, exec_id)

    # Serialize as compact single-line JSON instead of relying on the Python
    # repr of a list of dicts, which is not JSON (single-quoted strings).
    result_json = json.dumps(convert_data(results), separators=(",", ":"))

    # Prefer the output file named by GITHUB_OUTPUT when the runner provides
    # it; keep the ::set-output workflow command as a fallback otherwise.
    github_output = os.environ.get("GITHUB_OUTPUT")
    if github_output:
        with open(github_output, "a", encoding="utf-8") as output_file:
            output_file.write(f"result={result_json}\n")
    else:
        print(f"::set-output name=result::{result_json}")
#!/usr/bin/env bash

# Count arduino-cli downloads recorded in the Athena table named by
# AWS_ATHENA_SOURCE_TABLE and publish them as the "result" step output, a JSON
# array of gauge datapoints.
#
# This script performs the following:
#  1. Run MSCK REPAIR TABLE on the source table and wait for it to finish.
#  2. Run the query, use jq to capture the QueryExecutionId, and then capture
#     that into a bash variable.
#  3. Wait for the query to finish running (120 polls, 2 seconds apart).
#  4. Get the results.
#  5. Build the JSON datapoints array.
#
# Expected env variables are:
#  AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY for accessing AWS resources
#  AWS_ATHENA_SOURCE_TABLE
#  AWS_ATHENA_OUTPUT_LOCATION
#  GITHUB_REPOSITORY

set -euo pipefail

# Poll the given query execution until it reaches a terminal state.
#   $1 - QueryExecutionId to watch
# Returns 0 when the state is SUCCEEDED; returns 1 when it is FAILED or
# CANCELLED, or when it is still not finished after the last poll. Because of
# `set -e`, a non-zero return aborts the script at the call site.
wait_for_query() {
  local executionId="$1"
  local state=""

  for _ in $(seq 1 120); do
    state=$(
      aws athena get-query-execution \
        --query-execution-id "${executionId}" \
        --region us-east-1 | jq -r ".QueryExecution.Status.State"
    )

    if [[ "${state}" == "SUCCEEDED" ]]; then
      return 0
    fi

    echo "QueryExecutionId ${executionId} - state is ${state}"

    if [[ "${state}" == "FAILED" || "${state}" == "CANCELLED" ]]; then
      return 1
    fi

    sleep 2
  done

  # Never carry on with a query that has not finished.
  echo "QueryExecutionId ${executionId} - gave up waiting, last state is ${state}" >&2
  return 1
}

loadExecutionId=$(
  aws athena start-query-execution \
    --query-string "MSCK REPAIR TABLE ${AWS_ATHENA_SOURCE_TABLE};" \
    --result-configuration "OutputLocation=${AWS_ATHENA_OUTPUT_LOCATION}" \
    --region us-east-1 | jq -r ".QueryExecutionId"
)

echo "QueryExecutionId is ${loadExecutionId}"
wait_for_query "${loadExecutionId}"

# `read -d ''` returns non-zero when it reaches the end of the here-document;
# the leading `!` keeps `set -e` from treating that as a failure.
! read -r -d '' query <<EOM
SELECT replace(json_extract_scalar(url_decode(url_decode(querystring)),
'$.data.url'), 'https://downloads.arduino.cc/arduino-cli/arduino-cli_', '') AS flavor, count(json_extract(url_decode(url_decode(querystring)),'$')) AS gauge
FROM ${AWS_ATHENA_SOURCE_TABLE}
WHERE json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url') LIKE 'https://downloads.arduino.cc/arduino-cli/arduino-cli_%'
AND json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url') NOT LIKE '%latest%' -- exclude latest redirect
AND json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url') NOT LIKE '%alpha%' -- exclude early alpha releases
AND json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url') NOT LIKE '%.tar.bz2%' -- exclude very old releases archive formats
group by 1 ;
EOM

queryExecutionId=$(
  aws athena start-query-execution \
    --query-string "${query}" \
    --result-configuration "OutputLocation=${AWS_ATHENA_OUTPUT_LOCATION}" \
    --region us-east-1 | jq -r ".QueryExecutionId"
)

echo "QueryExecutionId is ${queryExecutionId}"
wait_for_query "${queryExecutionId}"

echo "Query succeeded. Processing data"
queryResult=$(
  aws athena get-query-results \
    --query-execution-id "${queryExecutionId}" \
    --region us-east-1 | jq --compact-output .
)

# printf format for one datapoint: value, version, os, arch (in that order).
# The trailing comma separates datapoints; the last one is removed below.
! read -r -d '' jsonTemplate <<EOM
{
"type": "gauge",
"name": "arduino.downloads.total",
"value": "%s",
"host": "${GITHUB_REPOSITORY}",
"tags": [
"version:%s",
"os:%s",
"arch:%s",
"cdn:downloads.arduino.cc",
"project:arduino-cli"
]
},
EOM

datapoints="["
# One compact JSON array per line; the first result row (column headers) is
# skipped. Reading line by line keeps a row intact even if it holds spaces.
while IFS= read -r row; do
  value=$(jq -r ".[1].VarCharValue" <<<"${row}")
  tag=$(jq -r ".[0].VarCharValue" <<<"${row}")
  # Some splitting to obtain 0.6.0, Windows, 32bit elements from string 0.6.0_Windows_32bit.zip
  # shellcheck disable=SC2207
  split=($(echo "$tag" | tr '_' '\n'))
  if [[ ${#split[@]} -ne 3 ]]; then
    continue
  fi
  # shellcheck disable=SC2207
  archSplit=($(echo "${split[2]}" | tr '.' '\n'))
  # The template is deliberately used as the printf format string.
  # shellcheck disable=SC2059
  datapoints+=$(printf "${jsonTemplate}" "${value}" "${split[0]}" "${split[1]}" "${archSplit[0]}")
done < <(jq --compact-output '.ResultSet.Rows[1:][] | .Data' <<<"${queryResult}")

if [[ "${datapoints}" == "[" ]]; then
  # No datapoint was produced: emit an empty array rather than a lone "]".
  datapoints="[]"
else
  # Replace the trailing comma with the closing bracket.
  datapoints="${datapoints::-1}]"
fi

echo "::set-output name=result::$(jq --compact-output . <<<"${datapoints}")"
...@@ -18,6 +18,10 @@ jobs: ...@@ -18,6 +18,10 @@ jobs:
- name: Checkout - name: Checkout
uses: actions/checkout@v2 uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: "3.x"
- name: Fetch downloads count from Arduino CDN using AWS Athena - name: Fetch downloads count from Arduino CDN using AWS Athena
id: fetch id: fetch
env: env:
...@@ -27,11 +31,8 @@ jobs: ...@@ -27,11 +31,8 @@ jobs:
AWS_ATHENA_OUTPUT_LOCATION: ${{ secrets.STATS_AWS_ATHENA_OUTPUT_LOCATION }} AWS_ATHENA_OUTPUT_LOCATION: ${{ secrets.STATS_AWS_ATHENA_OUTPUT_LOCATION }}
GITHUB_REPOSITORY: ${{ github.repository }} GITHUB_REPOSITORY: ${{ github.repository }}
run: | run: |
# Fetch jq 1.6 as VM has only 1.5 ATM pip install boto3 semver
wget -q https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 -O jq python .github/tools/fetch_athena_stats.py
chmod +x jq
PATH="${{ github.workspace }}:$PATH"
.github/tools/fetch_athena_stats.sh
- name: Send metrics - name: Send metrics
uses: masci/datadog@v1 uses: masci/datadog@v1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment