This commit is contained in:
nicksinger 2019-10-10 12:36:30 +02:00
commit daa1a844db
1 changed files with 51 additions and 0 deletions

51
unshort.py Normal file
View File

@ -0,0 +1,51 @@
#!/usr/bin/env python3
import requests
from urllib import parse
# Example short URL for manual testing (a zeit.de link shared on Twitter);
# uncomment to skip the interactive prompt:
#SHORT_URL = "http://bit.ly/33jJ7Fu"
# Prompt the user for the shortened URL that the rest of the script resolves.
SHORT_URL = input("Please enter your short url: ")
# Default tracking-parameter blocklist, copied verbatim from the Neat URL
# browser extension:
# https://github.com/Smile4ever/Neat-URL/blob/023ac7437810144c0d5389ec50435bba028effd3/options.js#L2
# The structure mirrors the extension's option object; only the
# comma-separated "default" string is consumed below.
NEAT_URL_BLOCKED_PARAMS = {
    "neat_url_blocked_params": {
        "type": "value",
        "default": "utm_source, utm_medium, utm_term, utm_content, utm_campaign, utm_reader, utm_place, utm_userid, utm_cid, utm_name, utm_pubreferrer, utm_swu, utm_viz_id, ga_source, ga_medium, ga_term, ga_content, ga_campaign, ga_place, yclid, _openstat, fb_action_ids, fb_action_types, fb_ref, fb_source, fbclid, action_object_map, action_type_map, action_ref_map, gs_l, pd_rd_*@amazon.*, _encoding@amazon.*, psc@amazon.*, ved@google.*, ei@google.*, sei@google.*, gws_rd@google.*, cvid@bing.com, form@bing.com, sk@bing.com, sp@bing.com, sc@bing.com, qs@bing.com, pq@bing.com, feature@youtube.com, gclid@youtube.com, kw@youtube.com, $/ref@amazon.*, _hsenc, mkt_tok, hmb_campaign, hmb_medium, hmb_source, source@sourceforge.net, position@sourceforge.net, callback@bilibili.com, elqTrackId, elqTrack, assetType, assetId, recipientId, campaignId, siteId, tag@amazon.*, ref_@amazon.*, pf_rd_*@amazon.*, spm@*.aliexpress.com, scm@*.aliexpress.com, aff_platform, aff_trace_key, terminal_id, _hsmi, fbclid, spReportId, spJobID, spUserID, spMailingID, utm_mailing, utm_brand, CNDID, mbid, trk, trkCampaign, sc_campaign, sc_channel, sc_content, sc_medium, sc_outcome, sc_geo, sc_country",
    },
}

# Additional parameter names blocked on top of the Neat URL defaults.
CUSTOM_BLOCKED_PARAMS = ["wt_zmc"]

# Final blocklist: the Neat URL string split on commas, each entry trimmed of
# surrounding whitespace, followed by the custom entries. Entries are kept
# verbatim, including scoped ones such as "ved@google.*".
BLOCKED_PARAMS = [
    entry.strip()
    for entry in NEAT_URL_BLOCKED_PARAMS.get("neat_url_blocked_params", {})
    .get("default", "")
    .split(",")
] + CUSTOM_BLOCKED_PARAMS
# Upper bound (seconds) for the redirect lookup; requests waits indefinitely
# when no timeout is given.
REQUEST_TIMEOUT_SECONDS = 10


def strip_blocked_params(url, blocked_params):
    """Return *url* with every blocked query parameter removed.

    Args:
        url: URL whose query string should be cleaned.
        blocked_params: Iterable of parameter names to drop. Each name is
            compared verbatim with the query keys, so blocklist entries that
            carry Neat URL scoping syntax (e.g. ``ved@google.*``) only match
            a parameter that is literally named that way.

    Returns:
        A ``(clean_url, dropped, remaining)`` tuple: ``clean_url`` is the
        rebuilt URL, ``dropped`` lists the removed parameter names in
        blocklist order, and ``remaining`` maps every surviving parameter
        name to its list of values.
    """
    components = parse.urlparse(url)
    # keep_blank_values=True: by default parse_qs discards parameters without
    # a value ("?flag", "?a="), which would both lose them from the cleaned
    # URL and make the component count zero for a non-empty query string.
    remaining = parse.parse_qs(components.query, keep_blank_values=True)
    dropped = []
    for name in blocked_params:
        # A name listed twice in the blocklist is only reported once because
        # it is gone from ``remaining`` after the first hit.
        if name in remaining:
            del remaining[name]
            dropped.append(name)
    clean_query = parse.urlencode(remaining, doseq=True)
    clean = parse.urlunparse(components._replace(query=clean_query))
    return clean, dropped, remaining


# Guarded so that importing this module (e.g. to reuse strip_blocked_params)
# does not trigger the network lookup; running the file as a script behaves
# as before.
if __name__ == "__main__":
    # A HEAD request is enough: only the final URL after following the
    # redirects is needed, not the response body.
    response = requests.head(
        SHORT_URL, allow_redirects=True, timeout=REQUEST_TIMEOUT_SECONDS
    )
    LONG_URL = response.url
    # Explicit check instead of ``assert``, which is skipped under
    # ``python -O``.
    if SHORT_URL == LONG_URL:
        raise SystemExit(
            "\"{}\" did not redirect anywhere, nothing to unshorten".format(SHORT_URL)
        )

    clean_url, dropped_params, query_components = strip_blocked_params(
        LONG_URL, BLOCKED_PARAMS
    )
    amount_dropped_components = len(dropped_params)
    amount_components_before = amount_dropped_components + len(query_components)

    if amount_components_before:
        for dropped_param in dropped_params:
            print("Dropping blocked param \"{}\"".format(dropped_param))
        # Integer arithmetic avoids dividing by zero and comparing floats.
        percentage_dropped = 100 * amount_dropped_components // amount_components_before
        print("We dropped {}% of the URLs components".format(percentage_dropped))
        # More than half, but not all, components were dropped: show what is
        # left as candidates for the blocklist.
        if 2 * amount_dropped_components > amount_components_before and query_components:
            print("Over 50% of the query components where blocked.")
            print("Therefore printing remaining components to maybe extend the blocklist:")
            for component, values in query_components.items():
                print("\t - {}: \"{}\"".format(component, values))
    else:
        print("No query components to clean found in url")

    print(clean_url)