-
-
Notifications
You must be signed in to change notification settings - Fork 203
Generative AI 2025 queries #4302
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
This SQL query calculates the percentage of sites using CSS gradients over specific years, grouped by client and rank.
This SQL query calculates the percentage of sites with a valid robots.txt file, including various directives, based on data from the HTTP Archive crawl.
This SQL script calculates the percentage of sites mentioning a specific user-agent in their robots.txt file, categorized by rank bucket. Inspired by https://paulcalvano.com/2025-08-21-ai-bots-and-robots-txt/
This SQL query analyzes the adoption of CSS gradients by clients over specific years, grouping results by rank and calculating the percentage of sites using gradients.
eb9f816 to
7e499b3
Compare
7e499b3 to
5c6f001
Compare
tunetheweb
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, but suggesting some formatting improvements
| is_root_page AND | ||
| client IN ('desktop', 'mobile') AND | ||
| date IN ('2022-06-01', '2025-07-01') AND | ||
| rank <= 10000000 AND |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| rank <= 10000000 AND | |
| rank <= 10000000 AND -- later years go beyond 10,000,000 but let's keep to this limit for consistency |
| WITH ranks AS ( | ||
| SELECT 1000 AS rank_grouping | ||
| UNION ALL | ||
| SELECT 10000 | ||
| UNION ALL | ||
| SELECT 100000 | ||
| UNION ALL | ||
| SELECT 1000000 | ||
| UNION ALL | ||
| SELECT 10000000 | ||
| UNION ALL | ||
| SELECT 100000000 | ||
| ) | ||
|
|
||
| SELECT | ||
| EXTRACT(YEAR FROM date) AS year, | ||
| client, | ||
| r.rank_grouping, | ||
| COUNT(DISTINCT page) AS total_sites, | ||
| COUNT(DISTINCT IF( | ||
| REGEXP_CONTAINS( | ||
| TO_JSON_STRING(custom_metrics.css_variables), | ||
| r'(?i)gradient\(' | ||
| ), | ||
| page, | ||
| NULL | ||
| )) AS sites_using_gradient, | ||
| SAFE_DIVIDE( | ||
| COUNT(DISTINCT IF( | ||
| REGEXP_CONTAINS( | ||
| TO_JSON_STRING(custom_metrics.css_variables), | ||
| r'(?i)gradient\(' | ||
| ), | ||
| page, | ||
| NULL | ||
| )), | ||
| COUNT(DISTINCT page) | ||
| ) AS pct_sites_using_gradient | ||
| FROM `httparchive.crawl.pages` | ||
| CROSS JOIN ranks r | ||
| WHERE | ||
| is_root_page AND | ||
| rank <= r.rank_grouping AND | ||
| date IN ( | ||
| DATE '2019-07-01', | ||
| DATE '2020-08-01', | ||
| DATE '2021-07-01', | ||
| DATE '2022-07-01', -- CSS metrics exception | ||
| DATE '2024-06-01', | ||
| DATE '2025-07-01' | ||
| ) | ||
| GROUP BY year, client, r.rank_grouping | ||
| ORDER BY year, client, r.rank_grouping; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| WITH ranks AS ( | |
| SELECT 1000 AS rank_grouping | |
| UNION ALL | |
| SELECT 10000 | |
| UNION ALL | |
| SELECT 100000 | |
| UNION ALL | |
| SELECT 1000000 | |
| UNION ALL | |
| SELECT 10000000 | |
| UNION ALL | |
| SELECT 100000000 | |
| ) | |
| SELECT | |
| EXTRACT(YEAR FROM date) AS year, | |
| client, | |
| r.rank_grouping, | |
| COUNT(DISTINCT page) AS total_sites, | |
| COUNT(DISTINCT IF( | |
| REGEXP_CONTAINS( | |
| TO_JSON_STRING(custom_metrics.css_variables), | |
| r'(?i)gradient\(' | |
| ), | |
| page, | |
| NULL | |
| )) AS sites_using_gradient, | |
| SAFE_DIVIDE( | |
| COUNT(DISTINCT IF( | |
| REGEXP_CONTAINS( | |
| TO_JSON_STRING(custom_metrics.css_variables), | |
| r'(?i)gradient\(' | |
| ), | |
| page, | |
| NULL | |
| )), | |
| COUNT(DISTINCT page) | |
| ) AS pct_sites_using_gradient | |
| FROM `httparchive.crawl.pages` | |
| CROSS JOIN ranks r | |
| WHERE | |
| is_root_page AND | |
| rank <= r.rank_grouping AND | |
| date IN ( | |
| DATE '2019-07-01', | |
| DATE '2020-08-01', | |
| DATE '2021-07-01', | |
| DATE '2022-07-01', -- CSS metrics exception | |
| DATE '2024-06-01', | |
| DATE '2025-07-01' | |
| ) | |
| GROUP BY year, client, r.rank_grouping | |
| ORDER BY year, client, r.rank_grouping; | |
| #standardSQL | |
| -- Adoption of CSS gradients in custom_metrics.css_variables | |
| -- Grouped by: year, client, rank bucket | |
| SELECT | |
| EXTRACT(YEAR FROM date) AS year, | |
| client, | |
| rank_grouping, | |
| COUNT(DISTINCT page) AS total_sites, | |
| COUNT(DISTINCT IF( | |
| REGEXP_CONTAINS( | |
| TO_JSON_STRING(custom_metrics.css_variables), | |
| r'(?i)gradient\(' | |
| ), | |
| page, | |
| NULL | |
| )) AS sites_using_gradient, | |
| SAFE_DIVIDE( | |
| COUNT(DISTINCT IF( | |
| REGEXP_CONTAINS( | |
| TO_JSON_STRING(custom_metrics.css_variables), | |
| r'(?i)gradient\(' | |
| ), | |
| page, | |
| NULL | |
| )), | |
| COUNT(DISTINCT page) | |
| ) AS pct_sites_using_gradient | |
| FROM | |
| `httparchive.crawl.pages`, | |
| UNNEST ([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping | |
| WHERE | |
| is_root_page AND | |
| rank <= rank_grouping AND | |
| date IN ( | |
| DATE '2019-07-01', | |
| DATE '2020-08-01', | |
| DATE '2021-07-01', | |
| DATE '2022-07-01', -- CSS metrics exception | |
| DATE '2024-06-01', | |
| DATE '2025-07-01' | |
| ) | |
| GROUP BY | |
| year, | |
| client, | |
| rank_grouping | |
| ORDER BY | |
| year, | |
| client, | |
| rank_grouping; |
| FROM `httparchive.crawl.pages` | ||
| WHERE | ||
| is_root_page AND | ||
| date IN ( | ||
| DATE '2019-07-01', | ||
| DATE '2020-08-01', | ||
| DATE '2021-07-01', | ||
| DATE '2022-07-01', | ||
| DATE '2024-06-01', | ||
| DATE '2025-07-01' | ||
| ) | ||
| GROUP BY year, client, rank | ||
| ORDER BY year, client, rank; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| FROM `httparchive.crawl.pages` | |
| WHERE | |
| is_root_page AND | |
| date IN ( | |
| DATE '2019-07-01', | |
| DATE '2020-08-01', | |
| DATE '2021-07-01', | |
| DATE '2022-07-01', | |
| DATE '2024-06-01', | |
| DATE '2025-07-01' | |
| ) | |
| GROUP BY year, client, rank | |
| ORDER BY year, client, rank; | |
| FROM | |
| `httparchive.crawl.pages` | |
| WHERE | |
| is_root_page AND | |
| date IN ( | |
| DATE '2019-07-01', | |
| DATE '2020-08-01', | |
| DATE '2021-07-01', | |
| DATE '2022-07-01', | |
| DATE '2024-06-01', | |
| DATE '2025-07-01' | |
| ) | |
| GROUP BY | |
| year, | |
| client, | |
| rank | |
| ORDER BY | |
| year, | |
| client, | |
| rank; |
| EXISTS( | ||
| SELECT 1 FROM UNNEST(technologies) AS t WHERE t.technology = 'Tailwind CSS' | ||
| ) AS uses_tailwind, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| EXISTS( | |
| SELECT 1 FROM UNNEST(technologies) AS t WHERE t.technology = 'Tailwind CSS' | |
| ) AS uses_tailwind, | |
| 'Tailwind CSS' IN UNNEST(technologies.technology) AS uses_tailwind, |
| FROM `httparchive.crawl.pages` | ||
| WHERE date = '2025-07-01' AND | ||
| is_root_page |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| FROM `httparchive.crawl.pages` | |
| WHERE date = '2025-07-01' AND | |
| is_root_page | |
| FROM | |
| `httparchive.crawl.pages` | |
| WHERE | |
| date = '2025-07-01' AND | |
| is_root_page |
| SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status, | ||
| JSON_QUERY(custom_metrics.robots_txt, '$.record_counts.by_useragent') AS byua |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status, | |
| JSON_QUERY(custom_metrics.robots_txt, '$.record_counts.by_useragent') AS byua | |
| SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.status) AS INT64) AS status, | |
| custom_metrics.robots_txt.record_counts.by_useragent AS byua |
| FROM `httparchive.crawl.pages` | ||
| WHERE date IN ('2019-07-01', '2020-08-01', '2021-07-01', '2022-06-01', '2024-06-01', '2025-07-01') AND |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| FROM `httparchive.crawl.pages` | |
| WHERE date IN ('2019-07-01', '2020-08-01', '2021-07-01', '2022-06-01', '2024-06-01', '2025-07-01') AND | |
| FROM | |
| `httparchive.crawl.pages` | |
| WHERE | |
| date IN ('2019-07-01', '2020-08-01', '2021-07-01', '2022-06-01', '2024-06-01', '2025-07-01') AND |
| k.date, k.client, k.rank, k.root_page, k.agent, | ||
| getByAgent(TO_JSON_STRING(b.byua), k.agent) AS agent_obj, | ||
| b.status |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| k.date, k.client, k.rank, k.root_page, k.agent, | |
| getByAgent(TO_JSON_STRING(b.byua), k.agent) AS agent_obj, | |
| b.status | |
| k.date, | |
| k.client, | |
| k.rank, | |
| k.root_page, | |
| k.agent, | |
| getByAgent(b.byua, k.agent) AS agent_obj, | |
| b.status |
| date, client, rank, root_page, agent, status, | ||
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) + | ||
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) + | ||
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) + | ||
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) + | ||
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum | ||
| FROM ua_presence |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| date, client, rank, root_page, agent, status, | |
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) + | |
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) + | |
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) + | |
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) + | |
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum | |
| FROM ua_presence | |
| date, | |
| client, | |
| rank, | |
| root_page, | |
| agent, | |
| status, | |
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.allow) AS INT64), 0) + | |
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.disallow) AS INT64), 0) + | |
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.crawl_delay) AS INT64), 0) + | |
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.noindex) AS INT64), 0) + | |
| COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.other) AS INT64), 0) AS rules_sum | |
| FROM | |
| ua_presence |
| date IN ( | ||
| '2020-01-01', '2020-04-01', '2020-07-01', '2020-10-01', | ||
| '2021-01-01', '2021-04-01', '2021-07-01', '2021-10-01', | ||
| '2022-01-01', '2022-04-01', '2022-07-01', '2022-10-01', | ||
| '2023-01-01', '2023-04-01', '2023-07-01', '2023-10-01', | ||
| '2024-01-01', '2024-04-01', '2024-07-01', '2024-10-01', | ||
| '2025-01-01', '2025-04-01', '2025-07-01', '2025-10-01' | ||
| ) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems like quite a random set of dates. I presume you're looking at quarterly data? Why not just get every date?
| date IN ( | |
| '2020-01-01', '2020-04-01', '2020-07-01', '2020-10-01', | |
| '2021-01-01', '2021-04-01', '2021-07-01', '2021-10-01', | |
| '2022-01-01', '2022-04-01', '2022-07-01', '2022-10-01', | |
| '2023-01-01', '2023-04-01', '2023-07-01', '2023-10-01', | |
| '2024-01-01', '2024-04-01', '2024-07-01', '2024-10-01', | |
| '2025-01-01', '2025-04-01', '2025-07-01', '2025-10-01' | |
| ) | |
| date >= '2020-01-01' |
|
@jcmpagel Could you have a look at @tunetheweb's suggestions? |
Makes progress on #4104