// Source listing metadata: 335 lines, 9.9 KiB, plaintext, executable file.
import parsePath from 'parse-path';
|
|
|
|
// Defaults for data URLs. Reference:
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs

// MIME type that `normalizeDataURL` omits from its output when no attribute or `base64` marker remains.
const DATA_URL_DEFAULT_MIME_TYPE = 'text/plain';

// Charset value (lowercase) whose `charset=` attribute `normalizeDataURL` drops from its output.
const DATA_URL_DEFAULT_CHARSET = 'us-ascii';
|
|
|
|
/**
 * Check whether a name matches at least one of the given filters.
 *
 * @param {string} name - Value to test (a query parameter key or a path component).
 * @param {Array<string|RegExp>} filters - Strings are compared with `===`; regular expressions are run with `.test()`.
 * @returns {boolean} `true` if any filter matches, otherwise `false`.
 */
const testParameter = (name, filters) => filters.some(filter => {
	if (filter instanceof RegExp) {
		// A `g` or `y` regex remembers `lastIndex` between calls, which would make
		// the result depend on previously tested names. Always start from index 0.
		filter.lastIndex = 0;
		return filter.test(name);
	}

	return filter === name;
});
|
|
|
|
/**
 * Normalize a `data:` URL.
 *
 * The MIME type and the `charset` value are lowercased, a `charset` equal to
 * `DATA_URL_DEFAULT_CHARSET` is dropped, and the MIME type is omitted when it
 * equals `DATA_URL_DEFAULT_MIME_TYPE` and no attribute or `base64` marker remains.
 *
 * @param {string} urlString - The data URL to normalize.
 * @param {object} options
 * @param {boolean} options.stripHash - When truthy, the `#hash` part is removed.
 * @returns {string} The normalized data URL, always starting with lowercase `data:`.
 * @throws {Error} If `urlString` does not have the `data:<type>,<data>` shape.
 */
const normalizeDataURL = (urlString, {stripHash}) => {
	// The scheme is matched case-insensitively, mirroring the `/^data:/i` check
	// the caller uses to route URLs here; the output always uses lowercase `data:`.
	const match = /^data:(?<type>[^,]*?),(?<data>[^#]*?)(?:#(?<hash>.*))?$/i.exec(urlString);

	if (!match) {
		throw new Error(`Invalid URL: ${urlString}`);
	}

	const {type, data} = match.groups;
	const hash = stripHash ? '' : match.groups.hash;
	const mediaType = type.split(';');

	// A trailing `base64` token is a marker, not an attribute; set it aside.
	let isBase64 = false;
	if (mediaType[mediaType.length - 1] === 'base64') {
		mediaType.pop();
		isBase64 = true;
	}

	// Lowercase MIME type
	const mimeType = (mediaType.shift() || '').toLowerCase();

	const attributes = mediaType
		.map(attribute => {
			let [key, value = ''] = attribute.split('=').map(string => string.trim());

			// Lowercase `charset`
			if (key === 'charset') {
				value = value.toLowerCase();

				// The default charset carries no information, so drop the attribute.
				if (value === DATA_URL_DEFAULT_CHARSET) {
					return '';
				}
			}

			return `${key}${value ? `=${value}` : ''}`;
		})
		.filter(Boolean);

	const normalizedMediaType = [...attributes];

	if (isBase64) {
		normalizedMediaType.push('base64');
	}

	// The MIME type is only emitted when something follows it or it is not the default.
	if (normalizedMediaType.length > 0 || (mimeType && mimeType !== DATA_URL_DEFAULT_MIME_TYPE)) {
		normalizedMediaType.unshift(mimeType);
	}

	return `data:${normalizedMediaType.join(';')},${isBase64 ? data.trim() : data}${hash ? `#${hash}` : ''}`;
};
|
|
|
|
/**
 * Normalize a URL string.
 *
 * `data:` URLs are delegated to `normalizeDataURL`. Everything else is parsed
 * with the WHATWG `URL` class, adjusted according to `options`, and serialized again.
 *
 * @param {string} urlString - URL to normalize. Surrounding whitespace is trimmed.
 * @param {object} [options] - Overrides for the defaults below.
 * @param {string} [options.defaultProtocol='http:'] - Prepended when the input has no protocol or is protocol-relative (`//host`).
 * @param {boolean} [options.normalizeProtocol=true] - When `false`, a protocol-relative input whose result starts with `http://` is returned protocol-relative again.
 * @param {boolean} [options.forceHttp=false] - Rewrite `https:` to `http:`.
 * @param {boolean} [options.forceHttps=false] - Rewrite `http:` to `https:`.
 * @param {boolean} [options.stripAuthentication=true] - Clear username and password.
 * @param {boolean} [options.stripHash=false] - Remove the hash.
 * @param {boolean} [options.stripTextFragment=true] - Remove a `:~:text` fragment from the hash (only consulted when `stripHash` is off).
 * @param {boolean} [options.stripWWW=true] - Remove a leading `www.` from hostnames matching the pattern used below.
 * @param {Array<string|RegExp>|boolean} [options.removeQueryParameters=[/^utm_\w+/i]] - Filters for query keys to delete; `true` removes the whole query string.
 * @param {boolean} [options.removeTrailingSlash=true] - Remove a trailing slash from the pathname.
 * @param {boolean} [options.removeSingleSlash=true] - Remove the final `/` of the serialized URL when the hash is empty.
 * @param {boolean|Array<string|RegExp>} [options.removeDirectoryIndex=false] - Filters for a last path component to remove; `true` means `[/^index\.[a-z]+$/]`.
 * @param {boolean} [options.sortQueryParameters=true] - Sort query parameters.
 * @param {boolean} [options.stripProtocol=false] - Remove a leading `http://`, `https://` or `//` from the result.
 * @returns {string} The normalized URL.
 * @throws {Error} For `view-source:` input, or when `forceHttp` and `forceHttps` are both set. Errors raised by `new URL()` for unparsable input propagate unchanged.
 */
function normalizeUrl(urlString, options) {
	options = {
		defaultProtocol: 'http:',
		normalizeProtocol: true,
		forceHttp: false,
		forceHttps: false,
		stripAuthentication: true,
		stripHash: false,
		stripTextFragment: true,
		stripWWW: true,
		removeQueryParameters: [/^utm_\w+/i],
		removeTrailingSlash: true,
		removeSingleSlash: true,
		removeDirectoryIndex: false,
		sortQueryParameters: true,
		// Read at the end of the function; declared here so every option has a visible default.
		stripProtocol: false,
		...options,
	};

	urlString = urlString.trim();

	// Data URL
	if (/^data:/i.test(urlString)) {
		return normalizeDataURL(urlString, options);
	}

	if (/^view-source:/i.test(urlString)) {
		throw new Error('`view-source:` is not supported as it is a non-standard protocol');
	}

	const hasRelativeProtocol = urlString.startsWith('//');
	const isRelativeUrl = !hasRelativeProtocol && /^\.*\//.test(urlString);

	// Prepend protocol
	if (!isRelativeUrl) {
		urlString = urlString.replace(/^(?!(?:\w+:)?\/\/)|^\/\//, options.defaultProtocol);
	}

	const urlObject = new URL(urlString);

	if (options.forceHttp && options.forceHttps) {
		throw new Error('The `forceHttp` and `forceHttps` options cannot be used together');
	}

	if (options.forceHttp && urlObject.protocol === 'https:') {
		urlObject.protocol = 'http:';
	}

	if (options.forceHttps && urlObject.protocol === 'http:') {
		urlObject.protocol = 'https:';
	}

	// Remove auth
	if (options.stripAuthentication) {
		urlObject.username = '';
		urlObject.password = '';
	}

	// Remove hash
	if (options.stripHash) {
		urlObject.hash = '';
	} else if (options.stripTextFragment) {
		urlObject.hash = urlObject.hash.replace(/#?:~:text.*?$/i, '');
	}

	// Remove duplicate slashes if not preceded by a protocol
	// NOTE: This could be implemented using a single negative lookbehind
	// regex, but we avoid that to maintain compatibility with older js engines
	// which do not have support for that feature.
	if (urlObject.pathname) {
		// TODO: Replace everything below with `urlObject.pathname = urlObject.pathname.replace(/(?<!\b[a-z][a-z\d+\-.]{1,50}:)\/{2,}/g, '/');` when Safari supports negative lookbehind.

		// Split the string by occurrences of this protocol regex, and perform
		// duplicate-slash replacement on the strings between those occurrences
		// (if any).
		const protocolRegex = /\b[a-z][a-z\d+\-.]{1,50}:\/\//g;

		let lastIndex = 0;
		let result = '';
		for (;;) {
			const match = protocolRegex.exec(urlObject.pathname);
			if (!match) {
				break;
			}

			const protocol = match[0];
			const protocolAtIndex = match.index;
			const intermediate = urlObject.pathname.slice(lastIndex, protocolAtIndex);

			result += intermediate.replace(/\/{2,}/g, '/');
			result += protocol;
			lastIndex = protocolAtIndex + protocol.length;
		}

		const remnant = urlObject.pathname.slice(lastIndex, urlObject.pathname.length);
		result += remnant.replace(/\/{2,}/g, '/');

		urlObject.pathname = result;
	}

	// Decode URI octets
	if (urlObject.pathname) {
		try {
			urlObject.pathname = decodeURI(urlObject.pathname);
		} catch {
			// Best effort: a malformed escape sequence leaves the pathname as it is.
		}
	}

	// Remove directory index
	if (options.removeDirectoryIndex === true) {
		options.removeDirectoryIndex = [/^index\.[a-z]+$/];
	}

	if (Array.isArray(options.removeDirectoryIndex) && options.removeDirectoryIndex.length > 0) {
		let pathComponents = urlObject.pathname.split('/');
		const lastComponent = pathComponents[pathComponents.length - 1];

		if (testParameter(lastComponent, options.removeDirectoryIndex)) {
			pathComponents = pathComponents.slice(0, -1);
			urlObject.pathname = pathComponents.slice(1).join('/') + '/';
		}
	}

	if (urlObject.hostname) {
		// Remove trailing dot
		urlObject.hostname = urlObject.hostname.replace(/\.$/, '');

		// Remove `www.`
		if (options.stripWWW && /^www\.(?!www\.)[a-z\-\d]{1,63}\.[a-z.\-\d]{2,63}$/.test(urlObject.hostname)) {
			// Each label should be max 63 at length (min: 1).
			// Source: https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_host_names
			// Each TLD should be up to 63 characters long (min: 2).
			// It is technically possible to have a single character TLD, but none currently exist.
			urlObject.hostname = urlObject.hostname.replace(/^www\./, '');
		}
	}

	// Remove query unwanted parameters
	if (Array.isArray(options.removeQueryParameters)) {
		// eslint-disable-next-line unicorn/no-useless-spread -- We are intentionally spreading to get a copy.
		for (const key of [...urlObject.searchParams.keys()]) {
			if (testParameter(key, options.removeQueryParameters)) {
				urlObject.searchParams.delete(key);
			}
		}
	}

	if (options.removeQueryParameters === true) {
		urlObject.search = '';
	}

	// Sort query parameters
	if (options.sortQueryParameters) {
		urlObject.searchParams.sort();

		// Calling `.sort()` encodes the search parameters, so we need to decode them again.
		try {
			urlObject.search = decodeURIComponent(urlObject.search);
		} catch {
			// Best effort: keep the encoded query string if it cannot be decoded.
		}
	}

	if (options.removeTrailingSlash) {
		urlObject.pathname = urlObject.pathname.replace(/\/$/, '');
	}

	const oldUrlString = urlString;

	// Take advantage of many of the Node `url` normalizations
	urlString = urlObject.toString();

	if (!options.removeSingleSlash && urlObject.pathname === '/' && !oldUrlString.endsWith('/') && urlObject.hash === '') {
		urlString = urlString.replace(/\/$/, '');
	}

	// Remove ending `/` unless removeSingleSlash is false
	if ((options.removeTrailingSlash || urlObject.pathname === '/') && urlObject.hash === '' && options.removeSingleSlash) {
		urlString = urlString.replace(/\/$/, '');
	}

	// Restore relative protocol, if applicable
	if (hasRelativeProtocol && !options.normalizeProtocol) {
		urlString = urlString.replace(/^http:\/\//, '//');
	}

	// Remove http/https
	if (options.stripProtocol) {
		urlString = urlString.replace(/^(?:https?:)?\/\//, '');
	}

	return urlString;
}
|
|
|
|
// Dependencies
|
|
|
|
/**
 * parseUrl
 * Parses the input url.
 *
 * **Note**: This *throws* if invalid urls are provided.
 *
 * @name parseUrl
 * @function
 * @param {String} url The input url.
 * @param {Boolean|Object} normalize Whether to normalize the url or not.
 *                         Default is `false`. If `true`, the url will
 *                         be normalized. If an object, it will be the
 *                         options object sent to [`normalize-url`](https://github.com/sindresorhus/normalize-url).
 *
 *                         For SSH urls, normalize won't work.
 *
 * @return {Object} An object containing the following fields:
 *
 *    - `protocols` (Array): An array with the url protocols (usually it has one element).
 *    - `protocol` (String): The first protocol, `"ssh"` (if the url is a ssh url) or `"file"`.
 *    - `port` (null|Number): The domain port.
 *    - `resource` (String): The url domain (including subdomains).
 *    - `user` (String): The authentication user (usually for ssh urls).
 *    - `pathname` (String): The url pathname.
 *    - `hash` (String): The url hash.
 *    - `search` (String): The url querystring value.
 *    - `href` (String): The input url.
 *    - `query` (Object): The url querystring, parsed as object.
 *    - `parse_failed` (Boolean): Whether the parsing failed or not.
 *
 * @throws {Error} When `url` is not a non-blank string, when it is longer than
 *                 `parseUrl.MAX_INPUT_LENGTH`, or when parsing fails and the
 *                 input does not look like a git-ssh url. The error carries the
 *                 original input as `subject_url`.
 */
const parseUrl = (url, normalize = false) => {

	// Constants
	const GIT_RE = /^(?:([a-z_][a-z0-9_-]{0,31})@|https?:\/\/)([\w\.\-@]+)[\/:]([\~,\.\w,\-,\_,\/]+?(?:\.git|\/)?)$/;

	// Throw an error tagged with the url as it was when the error was raised.
	const throwErr = msg => {
		const err = new Error(msg);
		err.subject_url = url;
		throw err;
	};

	if (typeof url !== "string" || !url.trim()) {
		throwErr("Invalid url.");
	}

	if (url.length > parseUrl.MAX_INPUT_LENGTH) {
		throwErr("Input exceeds maximum length. If needed, change the value of parseUrl.MAX_INPUT_LENGTH.");
	}

	if (normalize) {
		// Any truthy non-object value selects this fixed option set.
		if (typeof normalize !== "object") {
			normalize = {
				stripHash: false
			};
		}
		url = normalizeUrl(url, normalize);
	}

	const parsed = parsePath(url);

	// Potential git-ssh urls
	if (parsed.parse_failed) {
		const matched = parsed.href.match(GIT_RE);

		if (matched) {
			// Capture groups: 1 = user (undefined for http(s) inputs), 2 = host, 3 = path.
			parsed.protocols = ["ssh"];
			parsed.protocol = "ssh";
			parsed.resource = matched[2];
			parsed.host = matched[2];
			parsed.user = matched[1];
			parsed.pathname = `/${matched[3]}`;
			parsed.parse_failed = false;
		} else {
			throwErr("URL parsing failed.");
		}
	}

	return parsed;
};
|
|
|
|
// Longest input (in UTF-16 code units, per `String#length`) that `parseUrl` accepts before
// throwing; reassign this property to change the limit.
parseUrl.MAX_INPUT_LENGTH = 2048;
|
|
|
|
export { parseUrl as default };
|