- From: Niels Provos <provos@citi.umich.edu>
- Date: Mon, 21 May 2001 15:30:01 -0400 (EDT)
- To: www-lib@w3.org
Hi, I have encountered some bugs in libwww-5.3.2 running on an OpenBSD 2.9 system. If an HTTP/1.0 webserver does not send Content-Type and Content-Length, all data is read from the socket until the server closes the connection. However, no parsing happens. The state machine gets HT_CLOSED and transitions to HTTP_RECOVER_PIPE. To get the data parsed, I introduced the following hack: diff -urbw ../../libwww/Library/src/HTMIME.c src/HTMIME.c --- ../../libwww/Library/src/HTMIME.c Mon Dec 18 12:00:56 2000 +++ src/HTMIME.c Sat May 19 21:31:18 2001 @@ -363,6 +365,15 @@ start=b, end=b; if (me->EOLstate == EOL_END) { /* EOL_END */ if (ret == HT_OK) { + /* Another gross hack Niels */ + HTResponse *response = me->response; + long length = HTResponse_length(response); + if (HTResponse_format(response) == WWW_UNKNOWN && + length < 0) { + fprintf(stderr, __FUNCTION__": to WWW_HTML\n"); + HTResponse_setFormat(response, WWW_HTML); + + } b++, l--; ret = pumpData(me); HTNet_addBytesRead(me->net, 1); To get the bytes actually counted, I use the following diff: diff -ur ../../libwww/Library/src/HTMIME.c src/HTMIME.c --- ../../libwww/Library/src/HTMIME.c Mon Dec 18 12:00:56 2000 +++ src/HTMIME.c Mon May 21 15:16:27 2001 @@ -442,15 +453,13 @@ if (cbf) (*cbf)(me->request, HT_PROG_DONE, HT_MSG_NULL, NULL, NULL, NULL); return HT_LOADED; } - } else { - if ((status = (*me->target->isa->put_block)(me->target, b, l)) < 0) - return status; - HTNet_addBytesRead(net, l); - HTHost_setConsumed(HTNet_host(net), l); - return status; - } + } } - return (*me->target->isa->put_block)(me->target, b, l); + if ((status = (*me->target->isa->put_block)(me->target, b, l)) < 0) + return status; + HTNet_addBytesRead(net, l); + HTHost_setConsumed(HTNet_host(net), l); + return status; } else { HTAlertCallback * cbf = HTAlert_find(HT_PROG_DONE); if (cbf) (*cbf)(me->request, HT_PROG_DONE, HT_MSG_NULL, NULL, NULL, NULL); diff -ur ../../libwww/Library/src/HTTP.c src/HTTP.c --- ../../libwww/Library/src/HTTP.c Mon Oct 
30 05:04:23 2000 +++ src/HTTP.c Sat May 19 21:16:09 2001 @@ -1255,9 +1255,15 @@ continue; } else if (status==HT_LOADED) http->state = http->next; /* Jump to next state (OK or ERROR) */ - else if (status==HT_CLOSED) - http->state = HTTP_RECOVER_PIPE; - else if (status == HT_ERROR) + else if (status==HT_CLOSED) { + /* XXX - Niels */ + if (HTHost_version(host) == HTTP_10) { + HTAnchor_setLength(anchor, HTNet_bytesRead(net) - HTNet_headerBytesRead(net)); + http->state = http->next; + } else + + http->state = HTTP_RECOVER_PIPE; + } else if (status == HT_ERROR) http->state = HTTP_KILL_PIPE; else http->state = HTTP_ERROR; Mind you, these were the quickest fixes that I came up with to get on with my work. Now, however, I am stuck with bugs that I encounter when using the robot. At some point in the crawl, it just stops. There are no messages on stderr, and no atexit(3) functions are called. A system call trace reveals the following problems. The library gets EAGAIN on read or connect. Instead of waiting in select for the file descriptor to get ready, connect and read are called over and over again. This is an excerpt from kdump on a BSD system. 
31246 webbot 990461300.835916 CALL socket(0x2,0x1,0x6) 31246 webbot 990461300.835956 RET socket 0x10 31246 webbot 990461300.835972 CALL setsockopt(0x10,0x6,0x1,0xdfbfcfa4,0x4) 31246 webbot 990461300.835993 RET setsockopt 0 31246 webbot 990461300.836006 CALL fcntl(0x10,0x3,0) 31246 webbot 990461300.836020 RET fcntl 0x2 31246 webbot 990461300.836032 CALL fcntl(0x10,0x4,0x6) 31246 webbot 990461300.836046 RET fcntl 0 31246 webbot 990461300.836191 CALL getsockopt(0x10,0xffff,0x1001,0xdfbfcf00,0xdfbfcf04) 31246 webbot 990461300.836221 RET getsockopt 0 31246 webbot 990461300.836246 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.836367 RET connect -1 errno 36 31246 webbot 990461300.836417 CALL gettimeofday(0xdfbfcec0,0) 31246 webbot 990461300.836435 RET gettimeofday 0 31246 webbot 990461300.836470 CALL gettimeofday(0xdfbfd070,0) 31246 webbot 990461300.836488 RET gettimeofday 0 31246 webbot 990461300.836588 CALL write(0x3,0x11d000,0x81) 31246 webbot 990461300.836633 GIO fd 3 wrote 129 bytes 31246 webbot 990461300.836648 RET write 0x81 31246 webbot 990461300.836689 CALL write(0x6,0x124000,0x8a) 31246 webbot 990461300.836722 GIO fd 6 wrote 138 bytes 31246 webbot 990461300.836736 RET write 0x8a 31246 webbot 990461300.838472 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.838503 RET connect -1 errno 37 31246 webbot 990461300.838520 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.838534 RET connect -1 errno 37 31246 webbot 990461300.838548 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.838562 RET connect -1 errno 37 31246 webbot 990461300.838576 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.838591 RET connect -1 errno 37 31246 webbot 990461300.838605 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.838619 RET connect -1 errno 37 31246 webbot 990461300.838633 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.838665 RET connect -1 errno 37 31246 webbot 990461300.838680 CALL connect(0x10,0x10186c,0x10) 31246 webbot 
990461300.838694 RET connect -1 errno 37 31246 webbot 990461300.838708 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.838722 RET connect -1 errno 37 31246 webbot 990461300.838736 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.838752 RET connect -1 errno 37 31246 webbot 990461300.838766 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.838781 RET connect -1 errno 37 31246 webbot 990461300.838795 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.838810 RET connect -1 errno 37 31246 webbot 990461300.838824 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.838839 RET connect -1 errno 37 [...] 31246 webbot 990461300.941052 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.941066 RET connect -1 errno 37 31246 webbot 990461300.941215 CALL connect(0x10,0x10186c,0x10) 31246 webbot 990461300.941242 RET connect -1 errno 56 31246 webbot 990461300.942225 CALL gettimeofday(0xdfbfd0c0,0) 31246 webbot 990461300.942260 RET gettimeofday 0 31246 webbot 990461300.942225 CALL gettimeofday(0xdfbfd0c0,0) 31246 webbot 990461300.942260 RET gettimeofday 0 31246 webbot 990461300.942276 CALL write(0x10,0x110000,0xf5) 31246 webbot 990461300.942351 GIO fd 16 wrote 245 bytes 31246 webbot 990461300.942395 RET write 0xf5 31246 webbot 990461300.942422 CALL read(0x10,0x107018,0x8000) 31246 webbot 990461300.942443 RET read -1 errno 35 31246 webbot 990461300.942467 CALL gettimeofday(0xdfbfd040,0) 31246 webbot 990461300.942484 RET gettimeofday 0 31246 webbot 990461300.942513 CALL read(0x10,0x107018,0x8000) 31246 webbot 990461300.942532 RET read -1 errno 35 31246 webbot 990461300.942547 CALL read(0x10,0x107018,0x8000) 31246 webbot 990461300.942562 RET read -1 errno 35 31246 webbot 990461300.942576 CALL read(0x10,0x107018,0x8000) 31246 webbot 990461300.942591 RET read -1 errno 35 31246 webbot 990461300.942606 CALL read(0x10,0x107018,0x8000) 31246 webbot 990461300.942621 RET read -1 errno 35 31246 webbot 990461300.942635 CALL read(0x10,0x107018,0x8000) 31246 
webbot 990461300.942650 RET read -1 errno 35 31246 webbot 990461300.942664 CALL read(0x10,0x107018,0x8000) 31246 webbot 990461300.942680 RET read -1 errno 35 31246 webbot 990461300.942694 CALL read(0x10,0x107018,0x8000) 31246 webbot 990461300.942710 RET read -1 errno 35 [...] 31246 webbot 990461301.053835 CALL read(0x10,0x107018,0x8000) 31246 webbot 990461301.053850 RET read -1 errno 35 31246 webbot 990461301.054151 CALL read(0x10,0x107018,0x8000) 31246 webbot 990461301.054207 GIO fd 16 read 131 bytes 31246 webbot 990461301.054226 RET read 0x83 31246 webbot 990461301.054465 CALL connect(0xf,0xe936c,0x10) 31246 webbot 990461301.054501 RET connect -1 errno 56 I do not understand why the EventLoop does not go back to calling select(2). Looking at the -current CVS repository, it does not seem that any of the changes there would have fixed these bugs. Any ideas? Regards, Niels Provos.
Received on Tuesday, 22 May 2001 04:23:43 UTC