add metatargetr page

favstats · Dec 27, 2024 · 3713c07 · 3713c07
1 parent f5c0d0e
commit 3713c07
Show file tree

Hide file tree

Showing 21 changed files with 596 additions and 66 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -3,3 +3,7 @@
 ^LICENSE\.md$
 ^README\.Rmd$
 ^cran-comments\.md$
+^_pkgdown\.yml$
+^docs$
+^pkgdown$
+^\.github$
diff --git a/.github/.gitignore b/.github/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
@@ -0,0 +1,49 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+  release:
+    types: [published]
+  workflow_dispatch:
+
+name: pkgdown.yaml
+
+permissions: read-all
+
+jobs:
+  pkgdown:
+    runs-on: ubuntu-latest
+    # Only restrict concurrency for non-PR jobs
+    concurrency:
+      group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::pkgdown, local::.
+          needs: website
+
+      - name: Build site
+        run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
+        shell: Rscript {0}
+
+      - name: Deploy to GitHub pages 🚀
+        if: github.event_name != 'pull_request'
+        uses: JamesIves/github-pages-deploy-action@v4.5.0
+        with:
+          clean: false
+          branch: gh-pages
+          folder: docs
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,4 @@
 .httr-oauth
 .DS_Store
 data
+docs
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -28,5 +28,5 @@ Suggests:
     OpenImageR,
     testthat (>= 3.0.0),
     arrow
-URL: https://github.com/favstats/metatargetr
+URL: https://github.com/favstats/metatargetr, https://favstats.github.io/metatargetr/
 BugReports: https://github.com/favstats/metatargetr/issues
diff --git a/NAMESPACE b/NAMESPACE
@@ -12,15 +12,23 @@ export(get_targeting_db)
 export(ggl_get_spending)
 export(map_dfr_progress)
 export(parse_location)
+export(retrieve_targeting_metadata)
 export(stupid_conversion)
 export(unnest_and_fix_dups)
 export(walk_progress)
 importFrom(arrow,read_parquet)
+importFrom(dplyr,arrange)
 importFrom(dplyr,bind_cols)
+importFrom(dplyr,filter)
 importFrom(dplyr,left_join)
+importFrom(dplyr,mutate)
 importFrom(dplyr,mutate_all)
+importFrom(dplyr,rename)
 importFrom(dplyr,select)
 importFrom(dplyr,slice)
+importFrom(dplyr,transmute)
+importFrom(httr,GET)
+importFrom(httr,content)
 importFrom(httr2,req_body_raw)
 importFrom(httr2,req_headers)
 importFrom(httr2,req_perform)
@@ -34,7 +42,10 @@ importFrom(purrr,is_empty)
 importFrom(purrr,map_dfr)
 importFrom(purrr,set_names)
 importFrom(rvest,html_element)
+importFrom(rvest,html_elements)
 importFrom(rvest,html_text)
 importFrom(stringr,str_remove)
 importFrom(stringr,str_split)
 importFrom(tibble,as_tibble)
+importFrom(tibble,tibble)
+importFrom(tidyr,separate)
diff --git a/R/data.R b/R/data.R
@@ -72,29 +72,6 @@ get_targeting_db <- function(the_cntry, tf, ds, remove_nas = T, verbose = F) {
 # latest_data %>% filter(is.na(no_data))
 
 
-
-#' Retrieve Report Data from GitHub Repository
-#'
-#' This function retrieves a report for a specific country and timeframe
-#' from a GitHub repository hosting RDS files. The file is downloaded
-#' to a temporary location, read into R, and then deleted.
-#'
-#' @param the_cntry Character. The ISO country code (e.g., "DE", "US").
-#' @param timeframe Character. One of "-yesterday", "-last_7_days", "-last_30_days", "-last_90_days", or "-lifelong".
-#' @param file_name Character. The name of the RDS file to download (e.g., "report_2024-12-25.rds").
-#' @param verbose Logical. Whether to print messages about the process. Default is `FALSE`.
-#' @return A data frame or object read from the RDS file.
-#' @export
-#'
-#' @examples
-#' # Example usage
-#' report_data <- get_report_db(
-#'   the_cntry = "DE",
-#'   timeframe = "-last_30_days",
-#'   file_name = "report_2024-12-25.rds",
-#'   verbose = TRUE
-#' )
-#' print(head(report_data))
 #' Retrieve Report Data from GitHub Repository
 #'
 #' This function retrieves a report for a specific country and timeframe
@@ -185,3 +162,77 @@ get_report_db <- function(the_cntry, timeframe, ds, verbose = FALSE) {
 #   ds = "2024-10-25",
 #   verbose = TRUE
 # )
+
+
+#' Retrieve Metadata for Targeting Data
+#'
+#' Lists the datasets that are available for one country and timeframe by
+#' scraping the asset list of the matching GitHub release (tag
+#' `<country_code>-last_<timeframe>_days`). No data files are downloaded.
+#'
+#' @param country_code Character. The ISO country code (e.g., "DE", "US").
+#' @param timeframe Character. The timeframe to filter: "7", "30", or "90".
+#' @param base_url Character. The base URL of the release asset listing; the
+#' release tag is appended to it. Defaults to
+#' `"https://github.com/favstats/meta_ad_targeting/releases/expanded_assets/"`.
+#' @return A tibble with one row per distinct dataset and the character
+#' columns `cntry` (the part of the release tag before the "-"), `ds` (the
+#' asset file name with its `.rds`, `.zip` or `.parquet` extension removed)
+#' and `tframe` (e.g. "last_30_days"), sorted by `ds` in descending order.
+#' @importFrom httr GET content
+#' @importFrom rvest html_elements html_text
+#' @importFrom dplyr transmute mutate filter rename arrange
+#' @importFrom tidyr separate
+#' @importFrom tibble tibble
+#' @export
+#'
+#' @examples
+#' # Retrieve metadata for Germany for the last 30 days
+#' metadata <- retrieve_targeting_metadata("DE", "30")
+#' print(metadata)
+retrieve_targeting_metadata <- function(country_code,
+                                        timeframe,
+                                        base_url = "https://github.com/favstats/meta_ad_targeting/releases/expanded_assets/") {
+    # Validate inputs
+    if (missing(country_code)) {
+        stop("Parameter `country_code` is required.")
+    }
+
+    # Check the length first so that `||` never receives a zero-length or
+    # multi-element condition (an error since R 4.3).
+    if (missing(timeframe) || length(timeframe) != 1 ||
+        !timeframe %in% c("7", "30", "90")) {
+        stop("`timeframe` must be one of: '7', '30', or '90'.")
+    }
+
+    # Release tag, e.g. "DE-last_30_days"; it doubles as the URL suffix
+    release_tag <- paste0(country_code, "-last_", timeframe, "_days")
+    url <- paste0(base_url, release_tag)
+
+    # Fetch the asset listing
+    response <- httr::GET(url)
+    status <- httr::status_code(response)
+
+    if (status != 200) {
+        stop("Failed to retrieve metadata from: ", url, ". Status code: ", status)
+    }
+
+    html_content <- xml2::read_html(httr::content(response, as = "text", encoding = "UTF-8"))
+
+    # Each ".Box-row" element is split on newlines and the third piece is
+    # taken as the file name. vapply() keeps the result a character vector
+    # even when the page lists no rows.
+    asset_lines <- html_content %>%
+        rvest::html_elements(".Box-row") %>%
+        rvest::html_text() %>%
+        strsplit("\n")
+    file_names <- vapply(asset_lines, function(x) trimws(x[3]), character(1))
+
+    tibble::tibble(file_name = file_names, tag = release_tag) %>%
+        dplyr::filter(file_name != "Source code") %>%
+        tidyr::separate(tag, into = c("cntry", "tframe"), sep = "-", remove = FALSE) %>%
+        dplyr::mutate(ds = stringr::str_remove(file_name, "\\.rds|\\.zip|\\.parquet")) %>%
+        dplyr::distinct(cntry, ds, tframe) %>%
+        tidyr::drop_na(ds) %>%
+        # desc() is namespaced because it is not among the imported dplyr functions
+        dplyr::arrange(dplyr::desc(ds))
+}
diff --git a/R/get_page_insights.R b/R/get_page_insights.R
@@ -53,9 +53,6 @@ if("page_info" %in% include_info ){
 ua <- sample(ua_list, 1)
 # print(ua)
 
-# pageid <- "7860876103"
-# timeframe <- "90"
-
 # Define static parameters
 static_params <- list(
   av = "0",                        # Likely application version; may not change often. Optional.
@@ -64,20 +61,8 @@ static_params <- list(
   a = "1",                         # Arbitrary request parameter; purpose unclear but likely required.
   req = "3",                       # Request parameter; often a sequence or batch request identifier. Likely required.
   hs = "19797.BP%3ADEFAULT.2.0..0.0", # Host session or configuration metadata; required for server-side routing.
-  # dpr = "1",                       # Device Pixel Ratio; reflects screen resolution. Optional but often included.
   ccg = "EXCELLENT",               # Connection grade; describes network quality. Optional but useful for server-side optimizations.
-  # rev = "1012093869",              # Revision/version number; likely application or API version. Required.
-  # s = "sbbnic%3Awquopy%3A7r1j3c",  # Session token or tracking identifier; unique to the visitor. Required.
-  # hsi = "7346737420686302672",     # Hashed Session ID; unique to the visitor. Required.
-  # dyn = "7xe6Eiw_K9zo5ObwKBAgc9o2exu13wqojyUW3qi4EoxW4E7SewXwCwfW7oqx60Vo1upEK12wvk1bwbG78b87C2m3K2y11wBw5Zx62G3i1ywdl0Fw4Hwp8kwyx2cU8EmwoHwrUcUjwVw9O7bK2S2W2K4EG1Mxu16wciaw4JwJwSyES0gq0K-1LwqobU2cwmo6O1Fw44wt8",
-  # Dynamic parameters encoded in a proprietary format; likely unique to each request. Required.
-
   csr = "",                        # CSRF token; placeholder here, likely required in some contexts.
-  # lsd = "AVo6-wl7l1Q",             # Login session data; required for session validation.
-  # jazoest = "2881",                # CSRF-related field; required for security checks.
-  # spin_r = "1012093869",           # Spin-related metadata (server-specific session management). Required.
-  # spin_b = "trunk",                # Backend branch/version. Required for routing to the correct API version.
-  # spin_t = "1710545602",           # Server timestamp. Required for ensuring request freshness.
   `_jssesw` = "1",                 # Encoded session value. Required for session management.
   fb_api_caller_class = "RelayModern", # API metadata describing the client. Required.
   fb_api_req_friendly_name = "AdLibraryMobileFocusedStateProviderQuery", # API-friendly name for request logging. Optional.
@@ -97,19 +82,13 @@ variables <- jsonlite::toJSON(
     active_status = "ALL",                  # Filter for active/inactive ads. Required.
     ad_type = "POLITICAL_AND_ISSUE_ADS",    # Type of ads (repeated for clarity). Required.
     bylines = list(),                       # List of bylines to filter ads. Optional.
-    # collation_token = "7ca3912f-0148-43ce-83e4-9a68ef656e4d",
-    # Unique token for grouping or collation; may be session-based. Likely required.
-
     content_languages = list(),             # Filter for content languages. Optional.
     count = 30,                             # Number of results to fetch. Optional but usually required for pagination.
     countries = list(iso2c),                # List of countries for filtering (repeated for clarity). Required.
     excluded_ids = list(),                  # IDs to exclude from results. Optional.
     full_text_search_field = "ALL",         # Full-text search field filter. Optional.
     group_by_modes = list(),                # Grouping modes for results. Optional.
     search_type = "PAGE",                   # Type of search (e.g., by page). Required.
-    # session_id = "1678877b-700b-485a-abb0-60efcb6b4019",
-    # Unique session identifier for the query. Required for tracking.
-
     sort_data = list(
       mode = "SORT_BY_RELEVANCY_MONTHLY_GROUPED", # Sorting mode. Required.
       direction = "ASCENDING"                    # Sorting direction. Required.

diff --git a/README.Rmd b/README.Rmd
@@ -115,6 +115,32 @@ page_info <- get_page_insights("121264564551002", include_info = "page_info")
 str(page_info)
 ```
 
+## `retrieve_targeting_metadata()`
+
+The `retrieve_targeting_metadata()` function retrieves metadata about targeting data releases from a GitHub repository so you can see which data is available (or missing). For the specified country and timeframe it reads the release's file list and returns one row per available dataset, with the country (`cntry`), the dataset date (`ds`), and the timeframe (`tframe`). **This metadata provides an overview of the available targeting data without downloading the actual files.**
+
+- `country_code` (*Character*):  
+  The ISO country code (e.g., `"DE"` for Germany, `"US"` for the United States).  
+
+- `timeframe` (*Character*):  
+  The timeframe for the targeting data. Acceptable values are:
+  - `"7"`: Last 7 days.
+  - `"30"`: Last 30 days.
+  - `"90"`: Last 90 days.
+
+- `base_url` (*Character*, default: `"https://github.com/favstats/meta_ad_targeting/releases/expanded_assets/"`):  
+  The base URL for the GitHub repository hosting the targeting data.
+
+
+```{r}
+
+# Retrieve metadata for Germany for the last 30 days
+metadata <- retrieve_targeting_metadata("DE", "30")
+
+print(metadata)
+
+```
+
 
 ## Get Images and Videos
 

diff --git a/README.md b/README.md
@@ -182,17 +182,63 @@ str(page_info)
 #>  $ page_verification     : chr "BLUE_VERIFIED"
 #>  $ entity_type           : chr "PERSON_PROFILE"
 #>  $ page_alias            : chr "VVD"
-#>  $ likes                 : chr "108141"
+#>  $ likes                 : chr "108142"
 #>  $ page_category         : chr "Political party"
 #>  $ ig_verification       : chr "TRUE"
 #>  $ ig_username           : chr "vvd"
-#>  $ ig_followers          : chr "42144"
+#>  $ ig_followers          : chr "42145"
 #>  $ shared_disclaimer_info: chr "[]"
 #>  $ about                 : chr "Doe mee en word lid van de VVD! 💙🧡 "
 #>  $ event                 : chr "CREATION: 2010-04-23 21:05:02"
 #>  $ no_address            : logi TRUE
 ```
 
+## `retrieve_targeting_metadata()`
+
+The `retrieve_targeting_metadata()` function retrieves metadata about
+targeting data releases from a GitHub repository so you can see which
+data is available (or missing). For the specified country and timeframe
+it reads the release's file list and returns one row per available
+dataset, with the country (`cntry`), the dataset date (`ds`), and the
+timeframe (`tframe`). **This metadata provides an overview of the
+available targeting data without downloading the actual files.**
+
+- `country_code` (*Character*):  
+  The ISO country code (e.g., `"DE"` for Germany, `"US"` for the United
+  States).
+
+- `timeframe` (*Character*):  
+  The timeframe for the targeting data. Acceptable values are:
+
+  - `"7"`: Last 7 days.
+  - `"30"`: Last 30 days.
+  - `"90"`: Last 90 days.
+
+- `base_url` (*Character*, default:
+  `"https://github.com/favstats/meta_ad_targeting/releases/expanded_assets/"`):  
+  The base URL for the GitHub repository hosting the targeting data.
+
+``` r
+
+# Retrieve metadata for Germany for the last 30 days
+metadata <- retrieve_targeting_metadata("DE", "30")
+
+print(metadata)
+#> # A tibble: 313 × 3
+#>    cntry ds         tframe      
+#>    <chr> <chr>      <chr>       
+#>  1 DE    2024-12-25 last_30_days
+#>  2 DE    2024-12-24 last_30_days
+#>  3 DE    2024-12-23 last_30_days
+#>  4 DE    2024-12-22 last_30_days
+#>  5 DE    2024-12-21 last_30_days
+#>  6 DE    2024-12-20 last_30_days
+#>  7 DE    2024-12-19 last_30_days
+#>  8 DE    2024-12-18 last_30_days
+#>  9 DE    2024-12-17 last_30_days
+#> 10 DE    2024-12-16 last_30_days
+#> # ℹ 303 more rows
+```
+
 ## Get Images and Videos
 
 The following code downloads the images and videos of a Meta ad. It also
@@ -276,4 +322,4 @@ timeseries_dat %>%
     ggplot2::theme_minimal()
 ```
 
-<img src="man/figures/README-unnamed-chunk-9-1.png" width="100%" />
+<img src="man/figures/README-unnamed-chunk-10-1.png" width="100%" />
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,3 +4,4 @@ @@
     .httr-oauth
     .DS_Store
     data
+    docs