Split the analyze command into separate files, using the same pattern as the extract command (#360)

* Split the analyze command into separate files, using the same pattern as the extract command

* isort
palewire authored Jan 11, 2023
1 parent 52fe65c commit df13673
Showing 6 changed files with 238 additions and 191 deletions.
3 changes: 3 additions & 0 deletions newshomepages/analyze/__init__.py
@@ -0,0 +1,3 @@
from .cli import cli_group as cli

__all__ = ("cli",)
4 changes: 4 additions & 0 deletions newshomepages/analyze/__main__.py
@@ -0,0 +1,4 @@
from .cli import cli_group

if __name__ == "__main__":
    cli_group()
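
With this entry point in place the subpackage can also be run directly with Python's -m switch, a usage sketch rather than part of the diff:

python -m newshomepages.analyze --help

Click adds the --help flag automatically, so the call should list the commands contributed by each source group.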
16 changes: 16 additions & 0 deletions newshomepages/analyze/cli.py
@@ -0,0 +1,16 @@
import click

from .drudge import cli as cli_drudge
from .lighthouse import cli as cli_lighthouse
from .us_right_wing import cli as cli_us_right_wing

cli_group = click.CommandCollection(
    sources=[
        cli_drudge,
        cli_lighthouse,
        cli_us_right_wing,
    ]
)

if __name__ == "__main__":
    cli_group()
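
The key pattern here is click.CommandCollection, which merges the commands registered on several click.Group objects into a single CLI, the same structure the commit message says the extract command already uses. A minimal, self-contained sketch of the technique (the group and command names are illustrative, not the project's):

import click


@click.group()
def first_group():
    """One source of commands."""
    pass


@first_group.command()
def hello():
    click.echo("hello")


@click.group()
def second_group():
    """Another source of commands."""
    pass


@second_group.command()
def goodbye():
    click.echo("goodbye")


# The collection exposes `hello` and `goodbye` side by side under one CLI,
# just as analyze/cli.py combines the drudge, lighthouse and us_right_wing groups.
cli = click.CommandCollection(sources=[first_group, second_group])

if __name__ == "__main__":
    cli()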
193 changes: 2 additions & 191 deletions newshomepages/analyze.py → newshomepages/analyze/drudge.py
@@ -12,12 +12,12 @@
from rich import print
from rich.progress import track

from . import utils
from .. import utils


@click.group()
def cli():
"""Analyze our data extracts."""
"""Analyze the Drudge Report."""
pass


@@ -333,192 +333,3 @@ def drudge_hyperlinks(output_dir: str = "./"):
    links_df.sort_values(
        ["domain", "earliest_date", "text"], ascending=[True, False, True]
    ).to_csv(output_path / "drudge-hyperlinks-analysis.csv", index=False)


@cli.command()
@click.option("-o", "--output-dir", "output_dir", default="./")
def us_right_wing_hyperlinks(output_dir: str = "./"):
    """Analyze U.S. Right Wing hyperlinks."""
    print(":abacus: Analyzing U.S. Right Wing hyperlinks")
    warnings.simplefilter("ignore")

    # Set the output path
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Read in our 90 day sample of hyperlinks
    df = utils.get_extract_df(
        "us-right-wing-hyperlinks-sample.csv",
        usecols=[
            "handle",
            "file_name",
            "date",
            "text",
            "url",
        ],
        dtype=str,
        parse_dates=["date"],
    )

    # Trim the strings
    df["text"] = (
        df.text.str.strip()
        .str.replace(r"\s{2,}", " ", regex=True)
        .str.replace(r"\n", " ")
    )
    df["url"] = df.url.str.strip()

    # Guess links with `storysniffer`
    sniffer = storysniffer.StorySniffer()
    links_df = (
        df.sort_values("date")
        .drop_duplicates(["url"], keep="first")
        .groupby(["handle", "text", "url"])
        .agg({"date": "min"})
        .rename(columns={"date": "earliest_date"})
        .reset_index()
    )
    links_df["is_story"] = links_df.apply(
        lambda x: sniffer.guess(x["url"], text=x["text"]), axis=1
    )

    # Make some corrections
    text_black_list = [
        "COMMENTS",
        "COMMENT",
    ]
    links_df.loc[links_df.text.str.upper().isin(text_black_list), "is_story"] = False
    whitelist = [
        r"\.(substack|theankler|commonsense|thedispatch).(com|news)/p/",
        r"^https://time.com/\d{5,}/*",
        r"^https://studyfinds.org/*.{5,}",
        r"^https://www.studyfinds.org/*.{5,}",
        r"^https://*.bbc.com/news/*.{5,}",
        r"^https://www.jpost.com/breaking-news/*.{5,}",
        r"^https://www.jpost.com/[a-z]{5,}/*.{5,}",
        r"^https://*.braintomorrow.com/*.{5,}"
        r"^https://finance.yahoo.com/news/*.{5,}",
        r"^https://www.vice.com/en/article/*.{5,}",
        r"^https://news.yahoo.com/*.{5,}",
        r"^https://www.nationalreview.com/corner/*"
        r"^https://www.nationalreview.com/the-morning-jolt/*",
    ]
    for s in whitelist:
        links_df.loc[links_df.url.str.contains(s, regex=True), "is_story"] = True

    # Cut anything that doesn't start with http
    links_df.loc[~links_df.url.str.startswith("http"), "is_story"] = False

    # Cut anything that appears too much
    n = len(df.file_name.unique())
    url_counts = df.groupby("url").size().rename("n").reset_index()
    too_much = url_counts[url_counts.n >= n * 0.5]
    links_df.loc[links_df.url.isin(too_much.url), "is_story"] = False

    # Parse out the domain
    links_df["domain"] = links_df.url.apply(
        lambda x: f"{tldextract.extract(x).domain}.{tldextract.extract(x).suffix}"
    )

    # Write the result
    links_df.sort_values(
        ["domain", "earliest_date", "text"], ascending=[True, False, True]
    ).to_csv(output_path / "us-right-wing-hyperlinks-analysis.csv", index=False)


@cli.command()
@click.option("-o", "--output-dir", "output_dir", default="./")
def lighthouse(output_dir: str = "./"):
    """Analyze Lighthouse scores."""
    print(":abacus: Analyzing Lighthouse scores")

    # Set the output path
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Read in our seven day sample for all sites
    df = utils.get_extract_df(
        "lighthouse-sample.csv",
        usecols=[
            "handle",
            "file_name",
            "date",
            "performance",
            "accessibility",
            "seo",
            "best_practices",
        ],
        dtype={
            "handle": str,
            "file_name": str,
            "performance": float,
            "accessibility": float,
            "seo": float,
            "best_practices": float,
        },
        parse_dates=["date"],
    )

    # Exclude null scores
    notnull_df = df[~pd.isnull(df.performance)].copy()

    # Exclude any sites with less than 10 observations
    observations_by_site = notnull_df.groupby("handle").size().rename("n").reset_index()
    not_qualified = observations_by_site[observations_by_site.n < 10]
    qualified_df = notnull_df[~notnull_df.handle.isin(not_qualified.handle)].copy()

    # Exclude blacklisted sites
    blacklist = ["tass_agency"]
    qualified_df = qualified_df[~qualified_df.handle.isin(blacklist)].copy()

    # Aggregate descriptive statistics for each metric
    agg_df = qualified_df.groupby("handle").agg(
        {
            "performance": ["count", "median", "mean", "min", "max", "std"],
            "accessibility": ["count", "median", "mean", "min", "max", "std"],
            "seo": ["count", "median", "mean", "min", "max", "std"],
            "best_practices": ["count", "median", "mean", "min", "max", "std"],
        }
    )

    # Flatten the dataframe
    flat_df = agg_df.copy()
    flat_df.columns = ["_".join(col) for col in flat_df.columns]

    # Classify scores
    flat_df["performance_color"] = flat_df.performance_median.apply(_color_code)
    flat_df["accessibility_color"] = flat_df.accessibility_median.apply(_color_code)
    flat_df["seo_color"] = flat_df.seo_median.apply(_color_code)
    flat_df["best_practices_color"] = flat_df.best_practices_median.apply(_color_code)

    # Rank scores
    flat_df["performance_rank"] = flat_df.performance_median.rank(
        ascending=False, method="min"
    )
    flat_df["accessibility_rank"] = flat_df.accessibility_median.rank(
        ascending=False, method="min"
    )
    flat_df["seo_rank"] = flat_df.seo_median.rank(ascending=False, method="min")
    flat_df["best_practices_rank"] = flat_df.best_practices_median.rank(
        ascending=False, method="min"
    )

    # Write the results
    flat_df.reset_index().to_csv(output_path / "lighthouse-analysis.csv", index=False)


def _color_code(val):
    """Return the classification of a metric according to Google's system.
    Source: https://developer.chrome.com/docs/lighthouse/performance/performance-scoring/
    """
    if val >= 0.9:
        return "green"
    elif val >= 0.5:
        return "orange"
    else:
        return "red"


if __name__ == "__main__":
    cli()
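
The hyperlink analyses reduce each URL to its registered domain with tldextract before sorting and writing the results. A small sketch of that step in isolation (the helper name is illustrative, not from the codebase):

import tldextract


def registered_domain(url: str) -> str:
    """Collapse a URL to "domain.suffix", as the lambda in the code above does."""
    parts = tldextract.extract(url)
    return f"{parts.domain}.{parts.suffix}"


# e.g. registered_domain("https://www.jpost.com/breaking-news/example") returns "jpost.com"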
107 changes: 107 additions & 0 deletions newshomepages/analyze/lighthouse.py
@@ -0,0 +1,107 @@
from pathlib import Path

import click
import pandas as pd
from rich import print

from .. import utils


@click.group()
def cli():
"""Analyze Lighthouse reports."""
pass


@cli.command()
@click.option("-o", "--output-dir", "output_dir", default="./")
def lighthouse(output_dir: str = "./"):
"""Analyze Lighthouse scores."""
print(":abacus: Analyzing Lighthouse scores")

# Set the output path
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)

# Read in our seven day sample for all sites
df = utils.get_extract_df(
"lighthouse-sample.csv",
usecols=[
"handle",
"file_name",
"date",
"performance",
"accessibility",
"seo",
"best_practices",
],
dtype={
"handle": str,
"file_name": str,
"performance": float,
"accessibility": float,
"seo": float,
"best_practices": float,
},
parse_dates=["date"],
)

# Exclude null scores
notnull_df = df[~pd.isnull(df.performance)].copy()

# Exclude any sites with less than 10 observations
observations_by_site = notnull_df.groupby("handle").size().rename("n").reset_index()
not_qualified = observations_by_site[observations_by_site.n < 10]
qualified_df = notnull_df[~notnull_df.handle.isin(not_qualified.handle)].copy()

# Exclude blacklisted sites
blacklist = ["tass_agency"]
qualified_df = qualified_df[~qualified_df.handle.isin(blacklist)].copy()

# Aggregate descriptive statistics for each metric
agg_df = qualified_df.groupby("handle").agg(
{
"performance": ["count", "median", "mean", "min", "max", "std"],
"accessibility": ["count", "median", "mean", "min", "max", "std"],
"seo": ["count", "median", "mean", "min", "max", "std"],
"best_practices": ["count", "median", "mean", "min", "max", "std"],
}
)

# Flatten the dataframe
flat_df = agg_df.copy()
flat_df.columns = ["_".join(col) for col in flat_df.columns]

# Classify scores
flat_df["performance_color"] = flat_df.performance_median.apply(_color_code)
flat_df["accessibility_color"] = flat_df.accessibility_median.apply(_color_code)
flat_df["seo_color"] = flat_df.seo_median.apply(_color_code)
flat_df["best_practices_color"] = flat_df.best_practices_median.apply(_color_code)

# Rank scores
flat_df["performance_rank"] = flat_df.performance_median.rank(
ascending=False, method="min"
)
flat_df["accessibility_rank"] = flat_df.accessibility_median.rank(
ascending=False, method="min"
)
flat_df["seo_rank"] = flat_df.seo_median.rank(ascending=False, method="min")
flat_df["best_practices_rank"] = flat_df.best_practices_median.rank(
ascending=False, method="min"
)

# Write the results
flat_df.reset_index().to_csv(output_path / "lighthouse-analysis.csv", index=False)


def _color_code(val):
"""Return the classification of a metric according to Google's system.
Source: https://developer.chrome.com/docs/lighthouse/performance/performance-scoring/
"""
if val >= 0.9:
return "green"
elif val >= 0.5:
return "orange"
else:
return "red"
