Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Controls for pseudonymisation #32

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .zenodo.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"license": "MPL-2.0",
"title": "Zeeschuimer",
"upload_type": "software",
"version": "v1.10.1",
"version": "v1.11.0",
"keywords": [
"scraping", "data capture", "4cat", "instagram", "tiktok"
],
Expand Down
2 changes: 1 addition & 1 deletion manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"description": "Collect data while browsing social media platforms and upload it for analysis later",
"manifest_version": 2,
"name": "Zeeschuimer",
"version": "1.10.1",
"version": "1.11.0",
"homepage_url": "https://github.com/digitalmethodsinitiative/zeeschuimer",

"browser_specific_settings": {
Expand Down
17 changes: 9 additions & 8 deletions modules/9gag.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,20 @@ zeeschuimer.register_module(
return [];
}
try {
response = JSON.parse(response.split(embedded_sigil_end)[0]);
const embedded_json = response.split(embedded_sigil_end)[0];
response = JSON.parse(embedded_json);
} catch (e) {
return [];
}
} else {
try {
data = JSON.parse(response);
} catch (SyntaxError) {
return [];
}
}

if(!("data" in data) || typeof data["data"] !== 'object' || !("posts" in data["data"])) {
try {
data = JSON.parse(response);
} catch (e) {
return [];
}

if(!data || typeof data["data"] !== 'object' || !("data" in data) || !("posts" in data["data"])) {
return [];
}

Expand Down
46 changes: 19 additions & 27 deletions modules/instagram.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,12 @@ zeeschuimer.register_module(
'instagram.com',
function (response, source_platform_url, source_url) {
let domain = source_platform_url.split("/")[2].toLowerCase().replace(/^www\./, '');
let endpoint = source_url.split("/").slice(3).join("/").split("?")[0].split("#")[0].replace(/\/$/, '');

if (!["instagram.com"].includes(domain)) {
console.log('ignoring non-instagram url ' + source_url);
return [];
}

/*let whitelisted_endpoints = [
"graphql/query", //live-loading @ front page
"api/v1/collections/list",
"api/v1/feed/user/33646200", //live-loading @ user page
"api/v1/tags/blessed/sections", //live-loading @ tag explore page
"api/v1/locations/214262158/sections", //live-loading @ location explore page
"api/v1/clips/music", //live-loading @ music overview page
]

if(!whitelisted_endpoints.includes(endpoint)) {
return [];
}*/

// determine what part of instagram we're working in
// 'view' unused for now but may have some bearing on how to parse the data
// in any case
Expand All @@ -32,11 +19,13 @@ zeeschuimer.register_module(
view = "frontpage";
} else if (["direct", "account", "directory", "lite", "legal"].includes(path[3])) {
// not post listings but misc instagram views/pages
// console.log('ignoring misc url ' + source_url);
return [];
} else if (source_url.indexOf('injected_story_units') >= 0) {
// injected ads (this URL appears on many ad blocklists!)
// might enable if we decide to also capture ads? but not clear where these actually show up in the
// interface...
// console.log('ignoring ads from ' + source_url);
return [];
} else if (path[3] === "explore") {
// hashtag, location view
Expand All @@ -52,14 +41,14 @@ zeeschuimer.register_module(
if ((source_platform_url.indexOf('reels/audio') >= 0
|| source_platform_url.indexOf('/explore/') >= 0
)
&& source_url.endsWith('graphql')) {
&& (source_url.endsWith('graphql') || source_url.endsWith('graphql/query'))) {
// reels audio page f.ex. loads personalised reels in the background (unrelated to the audio) but doesn't
// seem to actually use them
// seem to actually use them

// console.log('ignoring pre-cache ' + source_url);
return [];
}


let datas = [];
try {
// if it's JSON already, just parse it
Expand All @@ -68,7 +57,7 @@ zeeschuimer.register_module(
// data can be embedded in the HTML in these JavaScript statements
// this is mostly used for:
// - single post pages (e.g. https://www.instagram.com/p/C1hWCZLPQ9T/)
// ✔️ confirmed working as of 2024-apr-19
// ✔️ confirmed working as of 2024-aug-21

let js_prefixes = [
"{\"require\":[[\"ScheduledServerJS\",\"handle\",null,[{\"__bbox\":{\"require\":[[\"RelayPrefetchedStreamCache\",\"next\",[],["
Expand All @@ -93,7 +82,7 @@ zeeschuimer.register_module(
json_bit = json_bit.substring(0, -1);
}

if (json_bit.indexOf('adp_PolarisDesktopPostPageRelatedMediaGridQueryRelayPreloader') >= 0) {
if (json_bit.indexOf('adp_PolarisDesktopPostPageRelatedMediaGrid') >= 0) {
// 'related posts', this is never what we are looking for
continue;
}
Expand All @@ -117,12 +106,14 @@ zeeschuimer.register_module(
}

if (datas.length === 0) {
// console.log('no datas for ' + source_url);
return [];
}
}

if (datas.length === 1 && 'lightspeed_web_request_for_igd' in datas[0] && source_url.endsWith('graphql')) {
// this is one of those background requests
// console.log('ignoring background request ' + source_url);
datas = [];
}

Expand All @@ -142,11 +133,11 @@ zeeschuimer.register_module(

// pages not covered:
// - explore (e.g. https://www.instagram.com/explore/)
// ❌ as of 2024-feb-20
// ❌ as of 2024-aug-21
// - 'tagged' pages for a user (e.g. https://www.instagram.com/steveo/tagged/)
// ❌ as of 2024-feb-20
// ❌ as of 2024-aug-21
// - 'reels' user pages (e.g. https://www.instagram.com/ogata.yoshiyuki/reels/)
// ❌ as of 2024-feb-20
// ❌ as of 2024-aug-21
// these do not load enough post metadata (e.g. author or caption), so too different from other items
// to parse
// - suggested posts on user feed
Expand All @@ -155,19 +146,19 @@ zeeschuimer.register_module(
if (possible_item_lists.includes(property) || property === "items") {
// - posts on explore pages for specific tags (e.g. https://www.instagram.com/explore/tags/blessed/)
// - posts on explore pages for locations (e.g. https://www.instagram.com/explore/locations/238875664/switzerland/)
// ✔️ confirmed working as of 2024-feb-20
// ✔️ confirmed working as of 2024-aug-21
// - posts on explore pages for sounds (e.g. https://www.instagram.com/reels/audio/290315579897542/)
// ✔️ confirmed working as of 2024-feb-20
// ✔️ confirmed working as of 2024-aug-21
// - posts when opened by clicking on them
// ✔️ confirmed working as of 2024-feb-20
// ✔️ confirmed working as of 2024-aug-21
let items;
if (property === "medias" || property === "fill_items") {
items = obj[property].map(media => media["media"]);
} else if (property === "feed_items") {
items = obj[property].map(media => media["media_or_ad"]);
} else if (property === "items" && obj[property].length === obj[property].filter(i => Object.getOwnPropertyNames(i).join('') === 'media').length) {
// - posts on explore pages for sounds (e.g. https://www.instagram.com/reels/audio/290315579897542/)
// ✔️ confirmed working as of 2024-feb-20
// ✔️ confirmed working as of 2024-aug-21
if(property === 'items' && 'design' in obj) {
// this is loaded, but never actually displayed...
// seems to be a preview of reels for a given tag, but again, not
Expand Down Expand Up @@ -211,7 +202,7 @@ zeeschuimer.register_module(
}).map(node => node["media"]));
} else if (["xdt_api__v1__feed__user_timeline_graphql_connection"].includes(property)) {
// - posts on user pages (e.g. https://www.instagram.com/ogata.yoshiyuki/)
// ✔️ confirmed working as of 2024-feb-20
// ✔️ confirmed working as of 2024-aug-21
edges.push(...obj[property]["edges"].filter(edge => "node" in edge).map(edge => edge["node"]).filter(node => {
return node !== null
&& "id" in node
Expand All @@ -236,6 +227,7 @@ zeeschuimer.register_module(
}
}

// console.log('got ' + edges.length + ' via ' + source_url)
return edges;
}
);
58 changes: 52 additions & 6 deletions modules/linkedin.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,45 +10,87 @@ zeeschuimer.register_module(
// objects embedded in HTML are identified by this bit of text
let items = [];
let data = [];
let data_type = "";
try {
if(response.indexOf('<!DOCTYPE html>') >= 0) {
throw new Error();
}
// when dealing with JSON, just parse that JSON and process it
data.push(JSON.parse(response));
const json_data = JSON.parse(response);
data.push(json_data);
data_type = "JSON";
} catch (e) {
// data is not JSON, so it's probably HTML
// HTML has data embedded in <code> tags
// store these for processing
const code_regex = RegExp(/<code>(.[^<]+)<\/code>/g);
const code_regex = RegExp(/<code.*>([^<]+)<\/code>/g);

for (const code_bit of response.matchAll(code_regex)) {
// console.log("Code; checking for JSON");
try {
// use he to decode from HTML entities (the way the data is embedded)
data.push(JSON.parse(he.decode(code_bit)));
data.push(JSON.parse(he.decode(code_bit[1].trim())));
data_type = "HTML";
// console.log("Found JSON in code block");
} catch (e) {
}
}
}

const eligible_list_types = ["feedDashMainFeedByMainFeed", "feedDashInterestUpdatesByInterestFeedByKeywords", "feedDashProfileUpdatesByMemberShareFeed"]
const eligible_list_types = ["feedDashMainFeedByMainFeed", "feedDashInterestUpdatesByInterestFeedByKeywords", "feedDashProfileUpdatesByMemberShareFeed", "searchDashClustersByAll"]
const uninteresting_list_types = ["*dashMySettings", "messagingDashMessagingSettings", "*searchDashSearchHome", "searchDashTypeaheadByGlobalTypeahead", "messagingDashAffiliatedMailboxesAll", "legoDashPageContentsByPageKeyAndSlotId", "searchDashFilterClustersByFilters"]
for (const data_bit of data) {
// now we have the data, try to parse it
// is this object post data?
let item_index = [];
let location = "";
if ("data" in data_bit && "included" in data_bit) {
// items may be referenced as 'results' for search result pages or 'elements' for the feed
let item_key = '';
if ("*elements" in data_bit["data"]) {
item_index = data_bit["data"]["*elements"];
location = "data.*elements";
} else if ("results" in data_bit["data"]) {
item_index = data_bit["data"]["results"];
location = "data.results";
} else if ("data" in data_bit["data"] && Object.keys(data_bit["data"]["data"]).filter(k => eligible_list_types.includes(k))) {
for(const k of eligible_list_types) {
if(k in data_bit["data"]["data"]) {
item_index = data_bit["data"]["data"][k]["*elements"];
const elements_key = (data_bit["data"]["data"]['*elements'] !== undefined) ? '*elements' : 'elements';
item_index = data_bit["data"]["data"][k][elements_key];
location = `data.data.${k}.${elements_key}`;

if (typeof (item_index) !== 'string' && item_index.length > 0 && item_index[0]['items'] !== undefined) {
// embedded results on search page
item_index = item_index[0]['items'].map(item => {
return item['item']['searchFeedUpdate']['*update'];
});
}
break;
}
}
if (location === "") {
// Found nothing eligible
let uninteresting = false;
for (const k of uninteresting_list_types) {
if(k in data_bit["data"]["data"]) {
uninteresting = true;
}
}

if (!uninteresting) {
// Possibly interesting data
// console.log("No items found in data_bit:");
// console.log(data_bit);
}
continue;
}
} else {
return [];
// console.log("No items found in data:");
// console.log(data_bit);
continue;
}
//console.log(`Searching items at ${location} from ${data_type} data on ${source_platform_url}`);

// there is a list of objects, each with an ID
// and a separate list of items to display, a list of those IDs
Expand All @@ -61,6 +103,7 @@ zeeschuimer.register_module(

// then we get the objects with the IDs in the item list
// and that is our result set!
let num_items = 0;
for (let object_ref in item_index) {
let result = item_index[object_ref];

Expand All @@ -72,14 +115,17 @@ zeeschuimer.register_module(
// we are (for now?) only interested in posts, which are identified in this way
if (result.indexOf('urn:li:fs_updateV2:(urn:li:activity:') !== 0
&& result.indexOf('urn:li:fsd_update:(urn:li:activity:') !== 0) {
// console.log(`Skipping non-post item ${result}`);
continue;
}

let result_object = recursively_enrich(mapped_objects[result], mapped_objects);
result_object["id"] = result;

items.push(result_object);
num_items++;
}
console.log(`Found ${num_items} items in ${location} from ${data_type} data on ${source_platform_url}`);

}
}
Expand Down
22 changes: 19 additions & 3 deletions popup/interface.html
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@
margin-right: 0.25em;
}

input:not([type=checkbox]):not([type=radio]), button {
input:not([type=checkbox]):not([type=radio]), button, select {
background: var(--neutral-contrast-alt);
color: var(--accent);
border: 2px solid var(--neutral-contrast);
Expand Down Expand Up @@ -265,11 +265,15 @@
content: ' \2022';
}

.fourcat-url-container, .zeeschuimer-master-switch, .import-container {
.fourcat-url-container, .zeeschuimer-master-switch, .import-container, .fourcat-pseudonymisation-container {
text-align: center;
margin-bottom: 0.5em;
}

.fourcat-pseudonymisation-container select {
max-width: 15em;
}

#upload-status {
text-align: center;
}
Expand Down Expand Up @@ -381,7 +385,7 @@
<article>
<header>
<h1>Zeeschuimer</h1>
<span class="version"><a href="https://github.com/digitalmethodsinitiative/zeeschuimer/releases/tag/v1.10.1">v1.10.1</a></span>
<span class="version"><a href="https://github.com/digitalmethodsinitiative/zeeschuimer/releases/tag/v1.11.0">v1.11.0</a></span>
</header>
<section id="status">
<h2><span>Captured data objects</span></h2>
Expand Down Expand Up @@ -410,6 +414,18 @@ <h2><span>Connect to 4CAT</span></h2>
title="The URL of the 4CAT server to upload datasets to. Make sure you're logged in to this URL with this browser.">?</span>
</label>
</div>
<div class="fourcat-pseudonymisation-container">
<label>
<span>Pseudonymise data in 4CAT:</span>
<select id="fourcat-pseudonymisation">
<option value="anonymise">Anonymise - Replace author information with 'REDACTED'</option>
<option value="pseudonymise">Pseudonymise - Replace author information with hashed values</option>
<option value="none">Leave author information as-is</option>
</select>
<span class="tooltippable"
title="4CAT can pseudonymise the data after importing it in a number of ways. Note that this happens AFTER uploading and non-pseudonymised data will always be sent to the 4CAT server first. 4CAT versions prior to 1.43 do not support this and will require you to manually pseudonymise after uploading.">?</span>
</label>
</div>
<p id="upload-status"></p>
</section>
<section>
Expand Down
Loading