From bf96408b4e9ba0192d35bd8750a2e92815b657db Mon Sep 17 00:00:00 2001 From: Daniel Stenberg Date: Wed, 26 Apr 2023 09:57:03 +0200 Subject: [PATCH] offer {puny:host} and {puny:url} for --get Added two tests. Added docs. Requires >= 7.88.0 at build time. Fixes #64 --- URL-QUIRKS.md | 6 ++++++ tests.json | 30 ++++++++++++++++++++++++++++++ trurl.1 | 5 +++++ trurl.c | 24 +++++++++++++++++++++++- 4 files changed, 64 insertions(+), 1 deletion(-) diff --git a/URL-QUIRKS.md b/URL-QUIRKS.md index 0c2270f2..e3437167 100644 --- a/URL-QUIRKS.md +++ b/URL-QUIRKS.md @@ -42,6 +42,12 @@ Implemented in libcurl 7.81.0. Before this, the source formatting was kept. Run-time requirement. +## `CURLU_PUNYCODE` + +Added in libcurl 7.88.0. + +Build-time requirement. + ## Accepting % in host names The host name parser has been made stricter over time, with the most recent diff --git a/tests.json b/tests.json index 794c3bdd..2f8ed4b5 100644 --- a/tests.json +++ b/tests.json @@ -1594,6 +1594,36 @@ "returncode": 0, "stderr": "" } + }, + { + "input": { + "arguments": [ + "https://räksmörgås.se", + "-g", + "{puny:url}" + ] + }, + "minbuildtime": "7.88.0", + "expected": { + "stderr": "", + "returncode": 0, + "stdout": "https://xn--rksmrgs-5wao1o.se:443/\n" + } + }, + { + "input": { + "arguments": [ + "https://räksmörgås.se", + "-g", + "{puny:host}" + ] + }, + "minbuildtime": "7.88.0", + "expected": { + "stderr": "", + "returncode": 0, + "stdout": "xn--rksmrgs-5wao1o.se\n" + } } ] diff --git a/trurl.1 b/trurl.1 index 47e1bc6d..357671fd 100644 --- a/trurl.1 +++ b/trurl.1 @@ -76,6 +76,11 @@ You can access specific keys in the query string and out all values using the format \fB{query-all:key}\fP. This looks for 'key' case sensitively and will output all values for that key space-separated. +You can access the url and host components in their "punycoded" version, which +is how International Domain Names are converted into plain ascii, by using the +form \fB{puny:url}\fP and \fB{puny:host}\fP. 
If the host name is not using +IDN, this option provides the regular ascii name. + The "format" string supports the following backslash sequences: \&\\\\ - backslash diff --git a/trurl.c b/trurl.c index 5f0cdca8..41ac306f 100644 --- a/trurl.c +++ b/trurl.c @@ -29,6 +29,8 @@ #include #include +#include /* for setlocale() */ + #include "version.h" #ifdef _MSC_VER @@ -48,6 +50,9 @@ #else #define CURLU_ALLOW_SPACE 0 #endif +#if CURL_AT_LEAST_VERSION(7,88,0) +#define SUPPORTS_PUNYCODE +#endif #define OUTPUT_URL 0 /* default */ #define OUTPUT_SCHEME 1 @@ -480,6 +485,8 @@ static void get(struct option *op, CURLU *uh) char *cl; size_t vlen; bool urldecode = true; + bool punycode = false; + bool handled = true; end = strchr(ptr, endbyte); ptr++; /* pass the { */ if(!end) { @@ -501,10 +508,21 @@ static void get(struct option *op, CURLU *uh) showqkey(&ptr[10], end - cl - 1, urldecode, true); else if(!strncmp(ptr, "query:", 6)) showqkey(&ptr[6], end - cl - 1, urldecode, false); + else if(!strncmp(ptr, "puny:", 5)) { + punycode = true; +#ifndef SUPPORTS_PUNYCODE + warnf("Built without punycode support"); +#endif + ptr = cl + 1; + vlen = end - ptr; + handled = false; + } else errorf(ERROR_GET, "Bad --get syntax: %s", ptr); } - else { + else + handled = false; + if(!handled) { const struct var *v = comp2var(ptr, vlen); if(v) { char *nurl; @@ -512,6 +530,9 @@ static void get(struct option *op, CURLU *uh) rc = curl_url_get(uh, v->part, &nurl, CURLU_DEFAULT_PORT| CURLU_NO_DEFAULT_PORT| +#ifdef SUPPORTS_PUNYCODE + (punycode?CURLU_PUNYCODE:0)| +#endif (urldecode?CURLU_URLDECODE:0)); switch(rc) { case CURLUE_OK: @@ -1109,6 +1130,7 @@ int main(int argc, const char **argv) struct option o; struct curl_slist *node; memset(&o, 0, sizeof(o)); + setlocale(LC_ALL, ""); curl_global_init(CURL_GLOBAL_ALL); for(argc--, argv++; argc > 0; argc--, argv++) {