bug when no content-type and content-length

Hi,

I have encountered some bugs in libwww-5.3.2 running on an OpenBSD 2.9
system.

If an HTTP/1.0 web server does not send Content-Type and Content-Length,
all data is read from the socket until the server closes the
connection.  However, no parsing happens.  The state machine gets
HT_CLOSED and transitions to HTTP_RECOVER_PIPE.

To get the data parsed, I introduced the following hack:

diff -urbw ../../libwww/Library/src/HTMIME.c src/HTMIME.c
--- ../../libwww/Library/src/HTMIME.c	Mon Dec 18 12:00:56 2000
+++ src/HTMIME.c	Sat May 19 21:31:18 2001
@@ -363,6 +365,15 @@
 		start=b, end=b;
 		if (me->EOLstate == EOL_END) {		/* EOL_END */
 		    if (ret == HT_OK) {
+		        /* Another gross hack Niels */
+			HTResponse *response = me->response;
+			long length = HTResponse_length(response);
+			if (HTResponse_format(response) == WWW_UNKNOWN &&
+			    length < 0) {
+				fprintf(stderr, __FUNCTION__": to WWW_HTML\n");
+				HTResponse_setFormat(response, WWW_HTML);
+				
+			}
 			b++, l--;
 			ret = pumpData(me);
 			HTNet_addBytesRead(me->net, 1);

To get the bytes actually counted, I use the following diff:

diff -ur ../../libwww/Library/src/HTMIME.c src/HTMIME.c
--- ../../libwww/Library/src/HTMIME.c	Mon Dec 18 12:00:56 2000
+++ src/HTMIME.c	Mon May 21 15:16:27 2001
@@ -442,15 +453,13 @@
                     if (cbf) (*cbf)(me->request, HT_PROG_DONE, HT_MSG_NULL, NULL, NULL, NULL);
                     return HT_LOADED;
                 }
-            } else {
-		if ((status = (*me->target->isa->put_block)(me->target, b, l)) < 0)
-		    return status;
-		HTNet_addBytesRead(net, l);
-		HTHost_setConsumed(HTNet_host(net), l);
-		return status;
-	    }
+            }
 	}
-	return (*me->target->isa->put_block)(me->target, b, l);
+	if ((status = (*me->target->isa->put_block)(me->target, b, l)) < 0)
+		return status;
+	HTNet_addBytesRead(net, l);
+	HTHost_setConsumed(HTNet_host(net), l);
+	return status;
     } else {
         HTAlertCallback * cbf = HTAlert_find(HT_PROG_DONE);
         if (cbf) (*cbf)(me->request, HT_PROG_DONE, HT_MSG_NULL, NULL, NULL, NULL);
diff -ur ../../libwww/Library/src/HTTP.c src/HTTP.c
--- ../../libwww/Library/src/HTTP.c	Mon Oct 30 05:04:23 2000
+++ src/HTTP.c	Sat May 19 21:16:09 2001
@@ -1255,9 +1255,15 @@
 		      continue;
 		  } else if (status==HT_LOADED)
 		      http->state = http->next;	/* Jump to next state (OK or ERROR) */
-		  else if (status==HT_CLOSED)
-		      http->state = HTTP_RECOVER_PIPE;
-		  else if (status == HT_ERROR)
+		  else if (status==HT_CLOSED) {
+			  /* XXX - Niels */
+			  if (HTHost_version(host) == HTTP_10) {
+				  HTAnchor_setLength(anchor, HTNet_bytesRead(net) - HTNet_headerBytesRead(net));
+				  http->state = http->next;
+			  } else
+			  
+				  http->state = HTTP_RECOVER_PIPE;
+		  } else if (status == HT_ERROR)
 		      http->state = HTTP_KILL_PIPE;
 		  else
 		      http->state = HTTP_ERROR;

Mind you, these were the quickest fixes that I came up with to get on
with my work.  Now, however, I am stuck with bugs that I encounter when
using the robot.  At some point in the crawl, it just stops.  There
are no messages on stderr, and no atexit(3) functions are called.

A system call trace reveals the following problems.  The library gets
EAGAIN on read or connect.  Instead of waiting in select for the file
descriptor to become ready, connect and read are called over and over
again.  This is an excerpt from kdump on a BSD system.

 31246 webbot   990461300.835916 CALL  socket(0x2,0x1,0x6)
 31246 webbot   990461300.835956 RET   socket 0x10
 31246 webbot   990461300.835972 CALL  setsockopt(0x10,0x6,0x1,0xdfbfcfa4,0x4)
 31246 webbot   990461300.835993 RET   setsockopt 0
 31246 webbot   990461300.836006 CALL  fcntl(0x10,0x3,0)
 31246 webbot   990461300.836020 RET   fcntl 0x2
 31246 webbot   990461300.836032 CALL  fcntl(0x10,0x4,0x6)
 31246 webbot   990461300.836046 RET   fcntl 0
 31246 webbot   990461300.836191 CALL  getsockopt(0x10,0xffff,0x1001,0xdfbfcf00,0xdfbfcf04)
 31246 webbot   990461300.836221 RET   getsockopt 0
 31246 webbot   990461300.836246 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.836367 RET   connect -1 errno 36
 31246 webbot   990461300.836417 CALL  gettimeofday(0xdfbfcec0,0)
 31246 webbot   990461300.836435 RET   gettimeofday 0
 31246 webbot   990461300.836470 CALL  gettimeofday(0xdfbfd070,0)
 31246 webbot   990461300.836488 RET   gettimeofday 0
 31246 webbot   990461300.836588 CALL  write(0x3,0x11d000,0x81)
 31246 webbot   990461300.836633 GIO   fd 3 wrote 129 bytes
 31246 webbot   990461300.836648 RET   write 0x81
 31246 webbot   990461300.836689 CALL  write(0x6,0x124000,0x8a)
 31246 webbot   990461300.836722 GIO   fd 6 wrote 138 bytes
 31246 webbot   990461300.836736 RET   write 0x8a
 31246 webbot   990461300.838472 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.838503 RET   connect -1 errno 37
 31246 webbot   990461300.838520 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.838534 RET   connect -1 errno 37
 31246 webbot   990461300.838548 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.838562 RET   connect -1 errno 37
 31246 webbot   990461300.838576 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.838591 RET   connect -1 errno 37
 31246 webbot   990461300.838605 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.838619 RET   connect -1 errno 37
 31246 webbot   990461300.838633 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.838665 RET   connect -1 errno 37
 31246 webbot   990461300.838680 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.838694 RET   connect -1 errno 37
 31246 webbot   990461300.838708 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.838722 RET   connect -1 errno 37
 31246 webbot   990461300.838736 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.838752 RET   connect -1 errno 37
 31246 webbot   990461300.838766 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.838781 RET   connect -1 errno 37
 31246 webbot   990461300.838795 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.838810 RET   connect -1 errno 37
 31246 webbot   990461300.838824 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.838839 RET   connect -1 errno 37
[...]
 31246 webbot   990461300.941052 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.941066 RET   connect -1 errno 37
 31246 webbot   990461300.941215 CALL  connect(0x10,0x10186c,0x10)
 31246 webbot   990461300.941242 RET   connect -1 errno 56
 31246 webbot   990461300.942225 CALL  gettimeofday(0xdfbfd0c0,0)
 31246 webbot   990461300.942260 RET   gettimeofday 0
 31246 webbot   990461300.942225 CALL  gettimeofday(0xdfbfd0c0,0)
 31246 webbot   990461300.942260 RET   gettimeofday 0
 31246 webbot   990461300.942276 CALL  write(0x10,0x110000,0xf5)
 31246 webbot   990461300.942351 GIO   fd 16 wrote 245 bytes
 31246 webbot   990461300.942395 RET   write 0xf5
 31246 webbot   990461300.942422 CALL  read(0x10,0x107018,0x8000)
 31246 webbot   990461300.942443 RET   read -1 errno 35
 31246 webbot   990461300.942467 CALL  gettimeofday(0xdfbfd040,0)
 31246 webbot   990461300.942484 RET   gettimeofday 0
 31246 webbot   990461300.942513 CALL  read(0x10,0x107018,0x8000)
 31246 webbot   990461300.942532 RET   read -1 errno 35
 31246 webbot   990461300.942547 CALL  read(0x10,0x107018,0x8000)
 31246 webbot   990461300.942562 RET   read -1 errno 35
 31246 webbot   990461300.942576 CALL  read(0x10,0x107018,0x8000)
 31246 webbot   990461300.942591 RET   read -1 errno 35
 31246 webbot   990461300.942606 CALL  read(0x10,0x107018,0x8000)
 31246 webbot   990461300.942621 RET   read -1 errno 35
 31246 webbot   990461300.942635 CALL  read(0x10,0x107018,0x8000)
 31246 webbot   990461300.942650 RET   read -1 errno 35
 31246 webbot   990461300.942664 CALL  read(0x10,0x107018,0x8000)
 31246 webbot   990461300.942680 RET   read -1 errno 35
 31246 webbot   990461300.942694 CALL  read(0x10,0x107018,0x8000)
 31246 webbot   990461300.942710 RET   read -1 errno 35
[...]
 31246 webbot   990461301.053835 CALL  read(0x10,0x107018,0x8000)
 31246 webbot   990461301.053850 RET   read -1 errno 35
 31246 webbot   990461301.054151 CALL  read(0x10,0x107018,0x8000)
 31246 webbot   990461301.054207 GIO   fd 16 read 131 bytes
 31246 webbot   990461301.054226 RET   read 0x83
 31246 webbot   990461301.054465 CALL  connect(0xf,0xe936c,0x10)
 31246 webbot   990461301.054501 RET   connect -1 errno 56

I do not understand why the EventLoop does not go back to calling
select(2).  Looking at the -current CVS repository, it does
not seem that any of the changes there would have fixed these bugs.

Any ideas?

Regards,
 Niels Provos.

Received on Tuesday, 22 May 2001 04:23:43 UTC