digitalmethodsinitiative · stijn-uva · May 1, 2024 · May 23, 2024 · May 23, 2024 · May 23, 2024
diff --git a/.zenodo.json b/.zenodo.json
@@ -3,7 +3,7 @@
   "license": "MPL-2.0",
   "title": "Zeeschuimer",
   "upload_type": "software",
-  "version": "v1.10.1",
+  "version": "v1.11.0",
   "keywords": [
     "scraping", "data capture", "4cat", "instagram", "tiktok"
   ],

diff --git a/manifest.json b/manifest.json
@@ -3,7 +3,7 @@
   "description": "Collect data while browsing social media platforms and upload it for analysis later",
   "manifest_version": 2,
   "name": "Zeeschuimer",
-  "version": "1.10.1",
+  "version": "1.11.0",
   "homepage_url": "https://github.com/digitalmethodsinitiative/zeeschuimer",
 
   "browser_specific_settings": {

diff --git a/modules/9gag.js b/modules/9gag.js
@@ -22,19 +22,20 @@ zeeschuimer.register_module(
                 return [];
             }
             try {
-                response = JSON.parse(response.split(embedded_sigil_end)[0]);
+                const embedded_json = response.split(embedded_sigil_end)[0];
+                response = JSON.parse(embedded_json);
             } catch (e) {
                 return [];
             }
-        } else {
-            try {
-                data = JSON.parse(response);
-            } catch (SyntaxError) {
-                return [];
-            }
         }
 
-        if(!("data" in data) || typeof data["data"] !== 'object' || !("posts" in data["data"])) {
+        try {
+            data = JSON.parse(response);
+        } catch (e) {
+            return [];
+        }
+
+        if(!data || typeof data["data"] !== 'object' || !("data" in data) || !("posts" in data["data"])) {
             return [];
         }
 

diff --git a/modules/instagram.js b/modules/instagram.js
@@ -3,25 +3,12 @@ zeeschuimer.register_module(
     'instagram.com',
     function (response, source_platform_url, source_url) {
         let domain = source_platform_url.split("/")[2].toLowerCase().replace(/^www\./, '');
-        let endpoint = source_url.split("/").slice(3).join("/").split("?")[0].split("#")[0].replace(/\/$/, '');
 
         if (!["instagram.com"].includes(domain)) {
+            console.log('ignoring non-instagram url ' + source_url);
             return [];
         }
 
-        /*let whitelisted_endpoints = [
-            "graphql/query", //live-loading @ front page
-            "api/v1/collections/list",
-            "api/v1/feed/user/33646200", //live-loading @ user page
-            "api/v1/tags/blessed/sections", //live-loading @ tag explore page
-            "api/v1/locations/214262158/sections", //live-loading @ location explore page
-            "api/v1/clips/music", //live-loading @ music overview page
-        ]
-
-        if(!whitelisted_endpoints.includes(endpoint)) {
-            return [];
-        }*/
-
         // determine what part of instagram we're working in
         // 'view' unused for now but may have some bearing on how to parse the data
         // in any case
@@ -32,11 +19,13 @@ zeeschuimer.register_module(
             view = "frontpage";
         } else if (["direct", "account", "directory", "lite", "legal"].includes(path[3])) {
             // not post listings but misc instagram views/pages
+            // console.log('ignoring misc url ' + source_url);
             return [];
         } else if (source_url.indexOf('injected_story_units') >= 0) {
             // injected ads (this URL appears on many ad blocklists!)
             // might enable if we decide to also capture ads? but not clear where these actually show up in the
             // interface...
+            // console.log('ignoring ads from ' + source_url);
             return [];
         } else if (path[3] === "explore") {
             // hashtag, location view
@@ -52,14 +41,14 @@ zeeschuimer.register_module(
         if ((source_platform_url.indexOf('reels/audio') >= 0
                 || source_platform_url.indexOf('/explore/') >= 0
             )
-            && source_url.endsWith('graphql')) {
+            && (source_url.endsWith('graphql') || source_url.endsWith('graphql/query'))) {
             // reels audio page f.ex. loads personalised reels in the background (unrelated to the audio) but doesn't
-            // seem to actually use them
+            // seem to actually use them, so these responses are skipped
 
+            // console.log('ignoring pre-cache ' + source_url);
             return [];
         }
 
-
         let datas = [];
         try {
             // if it's JSON already, just parse it
@@ -68,7 +57,7 @@ zeeschuimer.register_module(
             // data can be embedded in the HTML in these JavaScript statements
             // this is mostly used for:
             // - single post pages (e.g. https://www.instagram.com/p/C1hWCZLPQ9T/)
-            //   ✔️ confirmed working as of 2024-apr-19
+            //   ✔️ confirmed working as of 2024-aug-21
 
             let js_prefixes = [
                 "{\"require\":[[\"ScheduledServerJS\",\"handle\",null,[{\"__bbox\":{\"require\":[[\"RelayPrefetchedStreamCache\",\"next\",[],["
@@ -93,7 +82,7 @@ zeeschuimer.register_module(
                         json_bit = json_bit.substring(0, -1);
                     }
 
-                    if (json_bit.indexOf('adp_PolarisDesktopPostPageRelatedMediaGridQueryRelayPreloader') >= 0) {
+                    if (json_bit.indexOf('adp_PolarisDesktopPostPageRelatedMediaGrid') >= 0) {
                         // 'related posts', this is never what we are looking for
                         continue;
                     }
@@ -117,12 +106,14 @@ zeeschuimer.register_module(
             }
 
             if (datas.length === 0) {
+                // console.log('no datas for ' + source_url);
                 return [];
             }
         }
 
         if (datas.length === 1 && 'lightspeed_web_request_for_igd' in datas[0] && source_url.endsWith('graphql')) {
             // this is one of those background requests
+            // console.log('ignoring background request ' + source_url);
             datas = [];
         }
 
@@ -142,11 +133,11 @@ zeeschuimer.register_module(
 
                 // pages not covered:
                 // - explore (e.g. https://www.instagram.com/explore/)
-                //   ❌ as of 2024-feb-20
+                //   ❌ as of 2024-aug-21
                 // - 'tagged' pages for a user (e.g. https://www.instagram.com/steveo/tagged/)
-                //   ❌ as of 2024-feb-20
+                //   ❌ as of 2024-aug-21
                 // - 'reels' user pages (e.g. https://www.instagram.com/ogata.yoshiyuki/reels/)
-                //   ❌ as of 2024-feb-20
+                //   ❌ as of 2024-aug-21
                 // these do not load enough post metadata (e.g. author or caption), so too different from other items
                 // to parse
                 // - suggested posts on user feed
@@ -155,19 +146,19 @@ zeeschuimer.register_module(
                 if (possible_item_lists.includes(property) || property === "items") {
                     // - posts on explore pages for specific tags (e.g. https://www.instagram.com/explore/tags/blessed/)
                     // - posts on explore pages for locations (e.g. https://www.instagram.com/explore/locations/238875664/switzerland/)
-                    //   ✔️ confirmed working as of 2024-feb-20
+                    //   ✔️ confirmed working as of 2024-aug-21
                     // - posts on explore pages for sounds (e.g. https://www.instagram.com/reels/audio/290315579897542/)
-                    //   ✔️ confirmed working as of 2024-feb-20
+                    //   ✔️ confirmed working as of 2024-aug-21
                     // - posts when opened by clicking on them
-                    //   ✔️ confirmed working as of 2024-feb-20
+                    //   ✔️ confirmed working as of 2024-aug-21
                     let items;
                     if (property === "medias" || property === "fill_items") {
                         items = obj[property].map(media => media["media"]);
                     } else if (property === "feed_items") {
                         items = obj[property].map(media => media["media_or_ad"]);
                     } else if (property === "items" && obj[property].length === obj[property].filter(i => Object.getOwnPropertyNames(i).join('') === 'media').length) {
                         // - posts on explore pages for sounds (e.g. https://www.instagram.com/reels/audio/290315579897542/)
-                        //   ✔️ confirmed working as of 2024-feb-20
+                        //   ✔️ confirmed working as of 2024-aug-21
                         if(property === 'items' && 'design' in obj) {
                             // this is loaded, but never actually displayed...
                             // seems to be a preview of reels for a given tag, but again, not
@@ -211,7 +202,7 @@ zeeschuimer.register_module(
                     }).map(node => node["media"]));
                 } else if (["xdt_api__v1__feed__user_timeline_graphql_connection"].includes(property)) {
                     // - posts on user pages (e.g. https://www.instagram.com/ogata.yoshiyuki/)
-                    //   ✔️ confirmed working as of 2024-feb-20
+                    //   ✔️ confirmed working as of 2024-aug-21
                     edges.push(...obj[property]["edges"].filter(edge => "node" in edge).map(edge => edge["node"]).filter(node => {
                         return node !== null
                             && "id" in node
@@ -236,6 +227,7 @@ zeeschuimer.register_module(
             }
         }
 
+        // console.log('got ' + edges.length + ' via ' + source_url)
         return edges;
     }
 );
diff --git a/modules/linkedin.js b/modules/linkedin.js
@@ -10,45 +10,87 @@ zeeschuimer.register_module(
         // objects embedded in HTML are identified by this bit of text
         let items = [];
         let data = [];
+        let data_type = "";
         try {
+            if(response.indexOf('<!DOCTYPE html>') >= 0) {
+                throw new Error();
+            }
             // when dealing with JSON, just parse that JSON and process it
-            data.push(JSON.parse(response));
+            const json_data = JSON.parse(response);
+            data.push(json_data);
+            data_type = "JSON";
         } catch (e) {
             // data is not JSON, so it's probably HTML
             // HTML has data embedded in <code> tags
             // store these for processing
-            const code_regex = RegExp(/<code>(.[^<]+)<\/code>/g);
+            const code_regex = RegExp(/<code.*>([^<]+)<\/code>/g);
+
             for (const code_bit of response.matchAll(code_regex)) {
+                // console.log("Code; checking for JSON");
                 try {
                     // use he to decode from HTML entities (the way the data is embedded)
-                    data.push(JSON.parse(he.decode(code_bit)));
+                    data.push(JSON.parse(he.decode(code_bit[1].trim())));
+                    data_type = "HTML";
+                    // console.log("Found JSON in code block");
                 } catch (e) {
                 }
             }
         }
 
-        const eligible_list_types = ["feedDashMainFeedByMainFeed", "feedDashInterestUpdatesByInterestFeedByKeywords", "feedDashProfileUpdatesByMemberShareFeed"]
+        const eligible_list_types = ["feedDashMainFeedByMainFeed", "feedDashInterestUpdatesByInterestFeedByKeywords", "feedDashProfileUpdatesByMemberShareFeed", "searchDashClustersByAll"]
+        const uninteresting_list_types = ["*dashMySettings", "messagingDashMessagingSettings", "*searchDashSearchHome", "searchDashTypeaheadByGlobalTypeahead", "messagingDashAffiliatedMailboxesAll", "legoDashPageContentsByPageKeyAndSlotId", "searchDashFilterClustersByFilters"]
         for (const data_bit of data) {
             // now we have the data, try to parse it
             // is this object post data?
             let item_index = [];
+            let location = "";
             if ("data" in data_bit && "included" in data_bit) {
                 // items may be referenced as 'results' for search result pages or 'elements' for the feed
                 let item_key = '';
                 if ("*elements" in data_bit["data"]) {
                     item_index = data_bit["data"]["*elements"];
+                    location = "data.*elements";
                 } else if ("results" in data_bit["data"]) {
                     item_index = data_bit["data"]["results"];
+                    location = "data.results";
                 } else if ("data" in data_bit["data"] && Object.keys(data_bit["data"]["data"]).filter(k => eligible_list_types.includes(k))) {
                     for(const k of eligible_list_types) {
                         if(k in data_bit["data"]["data"]) {
-                            item_index = data_bit["data"]["data"][k]["*elements"];
+                            const elements_key = (data_bit["data"]["data"]['*elements'] !== undefined) ? '*elements' : 'elements';
+                            item_index = data_bit["data"]["data"][k][elements_key];
+                            location = `data.data.${k}.${elements_key}`;
+
+                            if (typeof (item_index) !== 'string' && item_index.length > 0 && item_index[0]['items'] !== undefined) {
+                                // embedded results on search page
+                                item_index = item_index[0]['items'].map(item => {
+                                    return item['item']['searchFeedUpdate']['*update'];
+                                });
+                            }
                             break;
                         }
                     }
+                    if (location === "") {
+                        // none of the eligible list types is present in this response
+                        let uninteresting = false;
+                        for (const k of uninteresting_list_types) {
+                            if(k in data_bit["data"]["data"]) {
+                                uninteresting = true;
+                            }
+                        }
+
+                        if (!uninteresting) {
+                            // not a known-uninteresting list type either; uncomment the logging below to inspect it
+                            // console.log("No items found in data_bit:");
+                            // console.log(data_bit);
+                        }
+                        continue;
+                    }
                 } else {
-                    return [];
+                    // console.log("No items found in data:");
+                    // console.log(data_bit);
+                    continue;
                 }
+                //console.log(`Searching items at ${location} from ${data_type} data on ${source_platform_url}`);
 
                 // there is a list of objects, each with an ID
                 // and a separate list of items to display, a list of those IDs
@@ -61,6 +103,7 @@ zeeschuimer.register_module(
 
                 // then we get the objects with the IDs in the item list
                 // and that is our result set!
+                let num_items = 0;
                 for (let object_ref in item_index) {
                     let result = item_index[object_ref];
 
@@ -72,14 +115,17 @@ zeeschuimer.register_module(
                     // we are (for now?) only interested in posts, which are identified in this way
                     if (result.indexOf('urn:li:fs_updateV2:(urn:li:activity:') !== 0
                       && result.indexOf('urn:li:fsd_update:(urn:li:activity:') !== 0) {
+                        // console.log(`Skipping non-post item ${result}`);
                         continue;
                     }
 
                     let result_object = recursively_enrich(mapped_objects[result], mapped_objects);
                     result_object["id"] = result;
 
                     items.push(result_object);
+                    num_items++;
                 }
+                console.log(`Found ${num_items} items in ${location} from ${data_type} data on ${source_platform_url}`);
 
             }
         }

diff --git a/popup/interface.html b/popup/interface.html
@@ -219,7 +219,7 @@
             margin-right: 0.25em;
         }
 
-        input:not([type=checkbox]):not([type=radio]), button {
+        input:not([type=checkbox]):not([type=radio]), button, select {
             background: var(--neutral-contrast-alt);
             color: var(--accent);
             border: 2px solid var(--neutral-contrast);
@@ -265,11 +265,15 @@
             content: ' \2022';
         }
 
-        .fourcat-url-container, .zeeschuimer-master-switch, .import-container {
+        .fourcat-url-container, .zeeschuimer-master-switch, .import-container, .fourcat-pseudonymisation-container {
             text-align: center;
             margin-bottom: 0.5em;
         }
 
+        .fourcat-pseudonymisation-container select {
+            max-width: 15em;
+        }
+
         #upload-status {
             text-align: center;
         }
@@ -381,7 +385,7 @@
 <article>
     <header>
         <h1>Zeeschuimer</h1>
-        <span class="version"><a href="https://github.com/digitalmethodsinitiative/zeeschuimer/releases/tag/v1.10.1">v1.10.1</a></span>
+        <span class="version"><a href="https://github.com/digitalmethodsinitiative/zeeschuimer/releases/tag/v1.11.0">v1.11.0</a></span>
     </header>
     <section id="status">
         <h2><span>Captured data objects</span></h2>
@@ -410,6 +414,18 @@ <h2><span>Connect to 4CAT</span></h2>
                       title="The URL of the 4CAT server to upload datasets to. Make sure you're logged in to this URL with this browser.">?</span>
             </label>
         </div>
+        <div class="fourcat-pseudonymisation-container">
+            <label>
+                <span>Pseudonymise data in 4CAT:</span>
+                <select id="fourcat-pseudonymisation">
+                    <option value="anonymise">Anonymise - Replace author information with 'REDACTED'</option>
+                    <option value="pseudonymise">Pseudonymise - Replace author information with hashed values</option>
+                    <option value="none">Leave author information as-is</option>
+                </select>
+                <span class="tooltippable"
+                        title="4CAT can pseudonymise the data after importing it in a number of ways. Note that this happens AFTER uploading and non-pseudonymised data will always be sent to the 4CAT server first. 4CAT versions prior to 1.43 do not support this and will require you to manually pseudonymise after uploading.">?</span>
+            </label>
+        </div>
         <p id="upload-status"></p>
     </section>
     <section>