Add POC

2019-10-10 12:36:30 +02:00 · 2019-10-10 12:36:30 +02:00 · daa1a844db
commit daa1a844db
1 changed files with 51 additions and 0 deletions
--- a/unshort.py
+++ b/unshort.py
@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+import requests
+from urllib import parse
+
+# Example input: "http://bit.ly/33jJ7Fu" (example from Twitter, from zeit.de).
+# Surrounding whitespace from copy/paste is stripped so it cannot leak into the request URL.
+SHORT_URL = input("Please enter your short url: ").strip()
+
+# Tracking parameters borrowed from https://github.com/Smile4ever/Neat-URL/blob/023ac7437810144c0d5389ec50435bba028effd3/options.js#L2
+NEAT_URL_BLOCKED_PARAMS = {
+    "neat_url_blocked_params": {
+        "type": "value",
+        "default": "utm_source, utm_medium, utm_term, utm_content, utm_campaign, utm_reader, utm_place, utm_userid, utm_cid, utm_name, utm_pubreferrer, utm_swu, utm_viz_id, ga_source, ga_medium, ga_term, ga_content, ga_campaign, ga_place, yclid, _openstat, fb_action_ids, fb_action_types, fb_ref, fb_source, fbclid, action_object_map, action_type_map, action_ref_map, gs_l, pd_rd_*@amazon.*, _encoding@amazon.*, psc@amazon.*, ved@google.*, ei@google.*, sei@google.*, gws_rd@google.*, cvid@bing.com, form@bing.com, sk@bing.com, sp@bing.com, sc@bing.com, qs@bing.com, pq@bing.com, feature@youtube.com, gclid@youtube.com, kw@youtube.com, $/ref@amazon.*, _hsenc, mkt_tok, hmb_campaign, hmb_medium, hmb_source, source@sourceforge.net, position@sourceforge.net, callback@bilibili.com, elqTrackId, elqTrack, assetType, assetId, recipientId, campaignId, siteId, tag@amazon.*,  ref_@amazon.*, pf_rd_*@amazon.*, spm@*.aliexpress.com, scm@*.aliexpress.com, aff_platform, aff_trace_key, terminal_id, _hsmi, fbclid, spReportId, spJobID, spUserID, spMailingID, utm_mailing, utm_brand, CNDID, mbid, trk, trkCampaign, sc_campaign, sc_channel, sc_content, sc_medium, sc_outcome, sc_geo, sc_country",
+    },
+}
+# Additional parameters blocked on top of the Neat URL list.
+CUSTOM_BLOCKED_PARAMS = ["wt_zmc"]
+# NOTE(review): entries with "@domain" or "*" are plain strings here; confirm whether the matching code should interpret them.
+# Split the comma-separated Neat URL string into whitespace-stripped names, then add the custom ones.
+BLOCKED_PARAMS = [name.strip() for name in NEAT_URL_BLOCKED_PARAMS["neat_url_blocked_params"]["default"].split(",")]
+BLOCKED_PARAMS += CUSTOM_BLOCKED_PARAMS
+
+req = requests.head(SHORT_URL, allow_redirects=True, timeout=10)  # follow redirects; without a timeout this may block forever
+LONG_URL = req.url
+url_components = parse.urlparse(LONG_URL)
+if SHORT_URL == LONG_URL:  # explicit exit instead of assert, which is skipped under "python -O"
+    raise SystemExit("Resolved URL is identical to the entered URL; nothing to unshorten.")
+# Remove every blocked parameter from the query string and report how much was removed.
+query_components = {}
+if url_components.query:
+    query_components = parse.parse_qs(url_components.query, keep_blank_values=True)  # default would drop params like "q="
+    amount_components_before = len(query_components)
+    dropped_params = [param for param in dict.fromkeys(BLOCKED_PARAMS) if param in query_components]  # fromkeys: count duplicates once, keep order
+    for dropped_param in dropped_params:
+        del query_components[dropped_param]
+        print("Dropping blocked param \"{}\"".format(dropped_param))
+    # A query such as "&" parses to zero components (guard the division); multiplying first makes a full drop exactly 100.
+    percentage_dropped = 100 * len(dropped_params) / amount_components_before if amount_components_before else 0
+    print("We dropped {}% of the URLs components".format(int(percentage_dropped)))
+    if 50 < percentage_dropped < 100:
+        print("Over 50% of the query components where blocked.")
+        print("Therefore printing remaining components to maybe extend the blocklist:")
+        for component, values in query_components.items():
+            print("\t - {}: \"{}\"".format(component, values))
+else:
+    print("No query components to clean found in url")
+
+# Re-encode the remaining parameters (doseq expands the value lists from parse_qs) and rebuild the URL around them.
+clean_query = parse.urlencode(query_components, doseq=True)
+clean_url = parse.urlunparse(url_components._replace(query=clean_query))
+print(clean_url)