#!/usr/bin/env uv run python3

from google.cloud import bigquery
import pandas as pd
import os

print('Starting...')

# Service-account key file, exported through GOOGLE_APPLICATION_CREDENTIALS
# (the variable Google's client libraries consult for credentials).
# NOTE: this unconditionally replaces any value already set in the
# environment, and the path is specific to one machine.
key_file = '/Users/reuven/Downloads/bw-151-0ffffc35f2a0.json'
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=key_file)

# Name the project explicitly when building the BigQuery client instead of
# relying on whatever default project would otherwise be picked up.
project_id = 'bw-151'
client = bigquery.Client(project=project_id)

# SQL for the top-10 Python versions per month among 2025 PyPI downloads.
#
# - monthly_python_versions: counts rows of the public
#   `bigquery-public-data.pypi.file_downloads` table per (month,
#   major.minor Python version), restricted to 2025 and to rows where
#   details.python is not NULL.
# - ranked_versions: numbers the versions within each month by download
#   count, highest first (ROW_NUMBER, so ties still get distinct ranks).
# - The final SELECT keeps ranks 1-10, ordered by month and then rank.
#
# The literal is a raw string because the regex contains `\.`; in a normal
# string literal that is an invalid escape sequence, which raises
# DeprecationWarning (SyntaxWarning since Python 3.12). The resulting text
# sent to BigQuery is unchanged.
query = r"""
WITH monthly_python_versions AS (
  SELECT
    DATE_TRUNC(DATE(timestamp), MONTH) as month,
    REGEXP_EXTRACT(details.python, r"[0-9]+\.[0-9]+") AS python_version,
    COUNT(*) as downloads
  FROM
    `bigquery-public-data.pypi.file_downloads`
  WHERE
    DATE(timestamp) BETWEEN '2025-01-01' AND '2025-12-31'
    AND details.python IS NOT NULL
  GROUP BY
    month, python_version
),
ranked_versions AS (
  SELECT
    month,
    python_version,
    downloads,
    ROW_NUMBER() OVER (PARTITION BY month ORDER BY downloads DESC) as rank
  FROM
    monthly_python_versions
)
SELECT
  month,
  python_version,
  downloads,
  rank
FROM
  ranked_versions
WHERE
  rank <= 10
ORDER BY
  month, rank
"""


# Submit the query to BigQuery, then pull the result rows into a pandas
# DataFrame.
query_job = client.query(query)
df = query_job.to_dataframe()

print(f'Downloaded {len(df.index)} records. Saving to parquet...')

# Relative path: the file is written to the current working directory.
output_file = 'bw-151-versions-per-month.parquet'
df.to_parquet(output_file)
print('Done.')
