Skip to content

Commit

Permalink
feat: add detailed timeout error
Browse files Browse the repository at this point in the history
When a timeout occurs, now the error message is like:

```
Timeout of 1000 ms reached. Total time: 1000.527000 ms (DNS time: 0.084000 ms, TCP/SSL handshake time: 0.314000 ms, HTTP Request/Response time: 1000.014000 ms)
```

Before it used to be:

```
Timeout was reached
```

Closes #169.
  • Loading branch information
steve-chavez committed Dec 4, 2024
1 parent 05dd984 commit 868f3e4
Show file tree
Hide file tree
Showing 3 changed files with 187 additions and 7 deletions.
100 changes: 96 additions & 4 deletions src/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,13 @@ typedef struct {
int64 id;
StringInfo body;
struct curl_slist* request_headers;
int32 timeout_milliseconds;
} CurlData;

typedef struct {
char msg[256];
} curl_timeout_msg;

static size_t
body_cb(void *contents, size_t size, size_t nmemb, void *userp)
{
Expand All @@ -55,6 +60,8 @@ static void init_curl_handle(CURLM *curl_mhandle, MemoryContext curl_memctx, int
cdata->id = id;
cdata->body = makeStringInfo();

cdata->timeout_milliseconds = timeout_milliseconds;

if (!headersBin.isnull) {
ArrayType *pgHeaders = DatumGetArrayTypeP(headersBin.value);
struct curl_slist *request_headers = NULL;
Expand Down Expand Up @@ -105,7 +112,7 @@ static void init_curl_handle(CURLM *curl_mhandle, MemoryContext curl_memctx, int
CURL_EZ_SETOPT(curl_ez_handle, CURLOPT_HEADER, 0L);
CURL_EZ_SETOPT(curl_ez_handle, CURLOPT_URL, url);
CURL_EZ_SETOPT(curl_ez_handle, CURLOPT_HTTPHEADER, cdata->request_headers);
CURL_EZ_SETOPT(curl_ez_handle, CURLOPT_TIMEOUT_MS, timeout_milliseconds);
CURL_EZ_SETOPT(curl_ez_handle, CURLOPT_TIMEOUT_MS, cdata->timeout_milliseconds);
CURL_EZ_SETOPT(curl_ez_handle, CURLOPT_PRIVATE, cdata);
CURL_EZ_SETOPT(curl_ez_handle, CURLOPT_FOLLOWLOCATION, true);
if (log_min_messages <= DEBUG2)
Expand Down Expand Up @@ -164,16 +171,101 @@ void delete_expired_responses(char *ttl, int batch_size){
CommitTransactionCommand();
}

static void insert_failure_response(CURLcode return_code, int64 id){

/*
* Show a more detailed error message when a timeout happens, which includes the DNS, TCP/SSL handshake and HTTP request/response time. An example message is like:
*
* "Timeout of 800 ms reached. Total time: 801.159000 ms (DNS time: 73.407000 ms, TCP/SSL handshake time: 677.256000 ms, HTTP Request/Respose time: 50.103000 ms)"
*
* Curl allows to calculate the above by applying substractions on some internal timings. Refer to https://blog.cloudflare.com/a-question-of-timing/ for an explanation of these timings.
*
* There are extra considerations:
*
* - If a step surpasses the request timeout, say the TCP handshake (CURLINFO_CONNECT_TIME), its given timing is 0.
* However the TCP handshake duration can still be determined by using the total time (CURLINFO_TOTAL_TIME).
* We want to show at which phase the timeout occurred.
*
* - If a step is omitted, say an SSL handshake (CURLINFO_APPCONNECT_TIME) on non-HTTPS requests, its given timing is 0.
*
* - The pretransfer time (CURLINFO_PRETRANSFER_TIME) is greater than 0 when the HTTP request step starts.
*/
static curl_timeout_msg detailed_timeout_strerror(CURL *ez_handle, int32 timeout_milliseconds){
double namelookup; CURL_EZ_GETINFO(ez_handle, CURLINFO_NAMELOOKUP_TIME, &namelookup);
double appconnect; CURL_EZ_GETINFO(ez_handle, CURLINFO_APPCONNECT_TIME, &appconnect);
double connect; CURL_EZ_GETINFO(ez_handle, CURLINFO_CONNECT_TIME, &connect);
double pretransfer; CURL_EZ_GETINFO(ez_handle, CURLINFO_PRETRANSFER_TIME, &pretransfer);
double starttransfer; CURL_EZ_GETINFO(ez_handle, CURLINFO_STARTTRANSFER_TIME, &starttransfer);
double total; CURL_EZ_GETINFO(ez_handle, CURLINFO_TOTAL_TIME, &total);

elog(DEBUG2, "The curl timings are time_namelookup: %f, time_connect: %f, time_appconnect: %f, time_pretransfer: %f, time_starttransfer: %f, time_total: %f",
namelookup, connect, appconnect, pretransfer, starttransfer, total);

// Steps at which the request timed out
bool timedout_at_dns = namelookup == 0 && connect == 0; // if DNS time is 0 and no TCP occurred, it timed out at the DNS step
bool timedout_at_handshake = pretransfer == 0; // pretransfer determines if the HTTP step started, if 0 no HTTP ocurred and thus the timeout occurred at TCP or SSL handshake step
bool timedout_at_http = pretransfer > 0; // The HTTP step did start and the timeout occurred here

// Calculate the steps times
double _dns_time =
timedout_at_dns ?
total: // get the total since namelookup will be 0 because of the timeout
timedout_at_handshake ?
namelookup:
timedout_at_http ?
namelookup:
0;

double _handshake_time =
timedout_at_dns ?
0:
timedout_at_handshake ?
total - namelookup: // connect or appconnect will be 0 because of the timeout, get the total - DNS step time
timedout_at_http ?
(connect - namelookup) + // TCP handshake time
(appconnect > 0 ? (appconnect - connect): 0): // SSL handshake time. Prevent a negative here which can happen when no SSL is involved (plain HTTP request) and appconnect is 0
0;

double _http_time =
timedout_at_dns ?
0:
timedout_at_handshake ?
0:
timedout_at_http ?
total - pretransfer:
0;

// convert seconds to milliseconds
double dns_time_ms = _dns_time * 1000;
double handshake_time_ms = _handshake_time * 1000;
double http_time_ms = _http_time * 1000;
double total_time_ms = total * 1000;

// build the error message
curl_timeout_msg result = {.msg = {}};
sprintf(result.msg,
"Timeout of %d ms reached. Total time: %f ms (DNS time: %f ms, TCP/SSL handshake time: %f ms, HTTP Request/Response time: %f ms)",
timeout_milliseconds, total_time_ms, dns_time_ms, handshake_time_ms, http_time_ms
);
return result;
}

static void insert_failure_response(CURL *ez_handle, CURLcode return_code, int64 id, int32 timeout_milliseconds){
StartTransactionCommand();
PushActiveSnapshot(GetTransactionSnapshot());
SPI_connect();

const char* error_msg;
if (return_code == CURLE_OPERATION_TIMEDOUT){
error_msg = detailed_timeout_strerror(ez_handle, timeout_milliseconds).msg;
} else {
error_msg = curl_easy_strerror(return_code);
}

int ret_code = SPI_execute_with_args("\
insert into net._http_response(id, error_msg) values ($1, $2)",
2,
(Oid[]){INT8OID, CSTRINGOID},
(Datum[]){Int64GetDatum(id), CStringGetDatum(curl_easy_strerror(return_code))},
(Datum[]){Int64GetDatum(id), CStringGetDatum(error_msg)},
NULL, false, 1);

if (ret_code != SPI_OK_INSERT)
Expand Down Expand Up @@ -319,7 +411,7 @@ void insert_curl_responses(LoopState *lstate, MemoryContext curl_memctx){
CURL_EZ_GETINFO(ez_handle, CURLINFO_PRIVATE, &cdata);

if (return_code != CURLE_OK) {
insert_failure_response(return_code, cdata->id);
insert_failure_response(ez_handle, return_code, cdata->id, cdata->timeout_milliseconds);
} else {
char *contentType;
CURL_EZ_GETINFO(ez_handle, CURLINFO_CONTENT_TYPE, &contentType);
Expand Down
2 changes: 1 addition & 1 deletion src/event.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
#define WAIT_USE_KQUEUE
#else
#error "no event wait implementation available"
#error "no event loop implementation available"
#endif

#ifdef WAIT_USE_EPOLL
Expand Down
92 changes: 90 additions & 2 deletions test/test_http_timeout.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import time

import pytest
import re
from sqlalchemy import text

def test_http_get_timeout_reached(sess):
Expand All @@ -26,7 +27,94 @@ def test_http_get_timeout_reached(sess):
{"request_id": request_id},
).fetchone()

assert response == u'Timeout was reached'
assert response.startswith("Timeout of 5000 ms reached")


def test_http_detailed_timeout(sess):
"""the timeout shows a detailed error msg"""

pattern = r"""
Total\stime:\s* # Match 'Total time:' with optional spaces
(?P<A>[0-9]*\.?[0-9]+)\s*ms # Capture A (Total time)
\s*\(DNS\stime:\s* # Match '(DNS time:' with optional spaces
(?P<B>[0-9]*\.?[0-9]+)\s*ms,\s* # Capture B (DNS time)
TCP/SSL\shandshake\stime:\s* # Match 'TCP/SSL handshake time:' with spaces
(?P<C>[0-9]*\.?[0-9]+)\s*ms,\s* # Capture C (TCP/SSL handshake time)
HTTP\sRequest/Response\stime:\s* # Match 'HTTP Request/Response time:' with spaces
(?P<D>[0-9]*\.?[0-9]+)\s*ms\) # Capture D (HTTP Request/Response time)
"""

regex = re.compile(pattern, re.VERBOSE)

# TODO make this work locally
# Timeout at the DNS step
(request_id,) = sess.execute(text(
"""
select net.http_get('https://news.ycombinator.com/', timeout_milliseconds := 10);
"""
)).fetchone()

sess.commit()

# wait for timeout
time.sleep(1.1)

(response,) = sess.execute(
text(
"""
select error_msg from net._http_response where id = :request_id;
"""
),
{"request_id": request_id},
).fetchone()

match = regex.search(response)

total_time = float(match.group('A'))
dns_time = float(match.group('B'))
tcp_ssl_time = float(match.group('C'))
http_time = float(match.group('D'))

assert total_time == dns_time
assert tcp_ssl_time == 0
assert http_time == 0

# TODO add a TCP/SSL handshake timeout test
# This can be done locally on Linux with `sudo tc qdisc add dev lo root netem delay 500ms` and
# select net.http_get(url := 'http://localhost:8080/pathological', timeout_milliseconds := 1000);

# Timeout at the HTTP step
(request_id,) = sess.execute(text(
"""
select net.http_get(url := 'http://localhost:8080/pathological?delay=1', timeout_milliseconds := 1000)
"""
)).fetchone()

sess.commit()

# wait for timeout
time.sleep(2.1)

(response,) = sess.execute(
text(
"""
select error_msg from net._http_response where id = :request_id;
"""
),
{"request_id": request_id},
).fetchone()

match = regex.search(response)

total_time = float(match.group('A'))
dns_time = float(match.group('B'))
tcp_ssl_time = float(match.group('C'))
http_time = float(match.group('D'))

assert total_time > 0
assert dns_time > 0
assert tcp_ssl_time > 0
assert http_time > 0

def test_http_get_succeed_with_gt_timeout(sess):
"""net.http_get with timeout succeeds when the timeout is greater than the slow reply response time"""
Expand Down Expand Up @@ -75,7 +163,7 @@ def test_many_slow_mixed_with_fast(sess):
"""
select
count(*) filter (where error_msg is null and status_code = 200) as request_successes,
count(*) filter (where error_msg is not null and error_msg like 'Timeout was reached') as request_timeouts
count(*) filter (where error_msg is not null and error_msg like 'Timeout of 1000 ms reached%') as request_timeouts
from net._http_response;
"""
)).fetchone()
Expand Down

0 comments on commit 868f3e4

Please sign in to comment.