Add POC
This commit is contained in:
commit
daa1a844db
1 changed files with 51 additions and 0 deletions
51
unshort.py
Normal file
51
unshort.py
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import requests
|
||||||
|
from urllib import parse
|
||||||
|
|
||||||
|
#SHORT_URL = "http://bit.ly/33jJ7Fu" #example from twitter from zeit.de

# borrowed from https://github.com/Smile4ever/Neat-URL/blob/023ac7437810144c0d5389ec50435bba028effd3/options.js#L2
# NOTE(review): entries containing "@", "*" or "$" look like Neat URL's
# domain-scoped / wildcard rule syntax -- confirm against Neat URL. This script
# compares every entry literally against the query parameter names, so such
# entries only match a parameter carrying exactly that text as its name.
NEAT_URL_BLOCKED_PARAMS = {"neat_url_blocked_params": {
    "type": "value",
    "default": "utm_source, utm_medium, utm_term, utm_content, utm_campaign, utm_reader, utm_place, utm_userid, utm_cid, utm_name, utm_pubreferrer, utm_swu, utm_viz_id, ga_source, ga_medium, ga_term, ga_content, ga_campaign, ga_place, yclid, _openstat, fb_action_ids, fb_action_types, fb_ref, fb_source, fbclid, action_object_map, action_type_map, action_ref_map, gs_l, pd_rd_*@amazon.*, _encoding@amazon.*, psc@amazon.*, ved@google.*, ei@google.*, sei@google.*, gws_rd@google.*, cvid@bing.com, form@bing.com, sk@bing.com, sp@bing.com, sc@bing.com, qs@bing.com, pq@bing.com, feature@youtube.com, gclid@youtube.com, kw@youtube.com, $/ref@amazon.*, _hsenc, mkt_tok, hmb_campaign, hmb_medium, hmb_source, source@sourceforge.net, position@sourceforge.net, callback@bilibili.com, elqTrackId, elqTrack, assetType, assetId, recipientId, campaignId, siteId, tag@amazon.*, ref_@amazon.*, pf_rd_*@amazon.*, spm@*.aliexpress.com, scm@*.aliexpress.com, aff_platform, aff_trace_key, terminal_id, _hsmi, fbclid, spReportId, spJobID, spUserID, spMailingID, utm_mailing, utm_brand, CNDID, mbid, trk, trkCampaign, sc_campaign, sc_channel, sc_content, sc_medium, sc_outcome, sc_geo, sc_country",
}}

CUSTOM_BLOCKED_PARAMS = ["wt_zmc"]

# Split the comma-separated string shipped by Neat URL into a list, strip the
# whitespace around each entry, then append the locally maintained entries.
BLOCKED_PARAMS = [
    entry.strip()
    for entry in NEAT_URL_BLOCKED_PARAMS.get("neat_url_blocked_params", {}).get("default", "").split(",")
] + CUSTOM_BLOCKED_PARAMS

# Upper bound (seconds) for the redirect lookup so an unresponsive host cannot
# block the script forever.
REQUEST_TIMEOUT_SECONDS = 10


def resolve_url(short_url, timeout=REQUEST_TIMEOUT_SECONDS):
    """Follow the redirects of *short_url* and return the final URL.

    Issues a HEAD request with ``allow_redirects=True`` and returns the URL
    of the final response. Errors raised by ``requests`` propagate to the
    caller.
    """
    response = requests.head(short_url, allow_redirects=True, timeout=timeout)
    return response.url


def strip_blocked_params(url, blocked_params=None):
    """Remove blocked query parameters from *url*.

    Args:
        url: The URL whose query string should be cleaned.
        blocked_params: Iterable of parameter names to drop. Defaults to the
            module-level ``BLOCKED_PARAMS``.

    Returns:
        A tuple ``(clean_url, dropped, remaining)`` where ``clean_url`` is the
        re-assembled URL, ``dropped`` lists the removed parameter names in
        blocklist order, and ``remaining`` maps each surviving parameter name
        to its list of values.
    """
    if blocked_params is None:
        blocked_params = BLOCKED_PARAMS

    url_components = parse.urlparse(url)
    # keep_blank_values=True: otherwise parameters with an empty value (which
    # are not on the blocklist) would silently vanish from the cleaned URL.
    query_components = parse.parse_qs(url_components.query, keep_blank_values=True)

    dropped = []
    for blocked_param in blocked_params:
        if blocked_param in query_components:
            del query_components[blocked_param]
            dropped.append(blocked_param)

    clean_query = parse.urlencode(query_components, doseq=True)
    clean_url = parse.urlunparse((
        url_components.scheme,
        url_components.netloc,
        url_components.path,
        url_components.params,
        clean_query,
        url_components.fragment,
    ))
    return clean_url, dropped, query_components


def main():
    """Prompt for a short URL, resolve it and print the cleaned long URL."""
    short_url = input("Please enter your short url: ")

    try:
        long_url = resolve_url(short_url)
    except requests.RequestException as err:
        raise SystemExit("Could not resolve \"{}\": {}".format(short_url, err)) from err

    # Explicit check instead of ``assert``: asserts are stripped under ``-O``.
    if short_url == long_url:
        raise SystemExit("\"{}\" did not redirect anywhere; nothing to unshorten".format(short_url))

    clean_url, dropped, remaining = strip_blocked_params(long_url)
    amount_dropped_components = len(dropped)
    amount_components_before = amount_dropped_components + len(remaining)

    if amount_components_before:
        for blocked_param in dropped:
            print("Dropping blocked param \"{}\"".format(blocked_param))
        percentage_dropped = 100 * amount_dropped_components / amount_components_before
        print("We dropped {}% of the URLs components".format(int(percentage_dropped)))
        # Decide on the integer counts rather than on float equality with 100:
        # more than half dropped, but at least one component left to inspect.
        if 2 * amount_dropped_components > amount_components_before and remaining:
            print("Over 50% of the query components where blocked.")
            print("Therefore printing remaining components to maybe extend the blocklist:")
            for component, values in remaining.items():
                print("\t - {}: \"{}\"".format(component, values))
    else:
        # Covers both an empty query string and one that yields no parsable
        # components (which previously caused a division by zero).
        print("No query components to clean found in url")

    print(clean_url)


if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue