- From: Soumen Chakrabarti <soumen@cse.iitb.ernet.in>
- Date: Mon, 1 Jan 2001 16:23:41 +0530 (IST)
- To: www-lib@w3.org
We wish to write a "robot" which does not extract links to follow from
crawled pages, but is given a batch of URLs to fetch (sort of a batch-mode
bulk-fetch routine).
We tried using the non-blocking robot profile, added proxy support, and
issued requests for several URLs at once. The output stream for each request
is a byte-count stream (an HTContentCounter feeding a black hole). The
after-handler fires almost immediately with a status code of -1 for every page.
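In case it helps with diagnosis, our reading of the libwww headers is that -1
corresponds to HT_ERROR (HTUtils.h), while a successful fetch should finish
with HT_LOADED. A minimal after-filter for telling the two apart and dumping
the request's error stack might look like the sketch below; the HTError_*
accessor names are from memory of HTError.h, so please treat this as an
untested assumption rather than working code:

/* Sketch only: distinguish HT_ERROR from HT_LOADED and dump the error
** stack of a finished request.  HTRequest_error/HTError_* are used as we
** understand them from HTError.h and may differ in other libwww versions.
*/
PRIVATE int debug_after (HTRequest * request, HTResponse * response,
                         void * param, int status)
{
    if (status == HT_ERROR) {                        /* -1 in HTUtils.h */
        HTList * errors = HTRequest_error(request);  /* error stack */
        HTError * pres;
        while ((pres = (HTError *) HTList_nextObject(errors)))
            HTPrint("  error element=%d severity=%d\n",
                    HTError_index(pres), HTError_severity(pres));
    } else if (status == HT_LOADED) {
        HTPrint("  loaded OK\n");
    }
    return HT_OK;
}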
Would appreciate some guidance with this, thanks. Code follows:
#include "WWWLib.h" /* Global Library Include
file */
#include "WWWMIME.h" /* MIME
parser/generator */
#include "WWWNews.h" /* News access
module */
#include "WWWHTTP.h" /* HTTP access
module */
#include "WWWFTP.h"
#include "WWWFile.h"
#include "WWWGophe.h"
#include "WWWInit.h"
#define APP_NAME "GETTOOL"
#define APP_VERSION "1.0"
#define DEFAULT_OUTPUT_FILE "get.out"
PRIVATE int printer (const char * fmt, va_list pArgs)
{
    return (vfprintf(stdout, fmt, pArgs));
}

PRIVATE int tracer (const char * fmt, va_list pArgs)
{
    return (vfprintf(stderr, fmt, pArgs));
}
class Context {
public:
    int * premain;          // pointer to main()'s count of outstanding requests
    HTStream * stream;      // the byte-count stream for this request
};
/*
** We get called here from the event loop when we are done
** loading. Here we terminate the program as we have nothing
** better to do.
*/
int terminate_handler (HTRequest * request, HTResponse * response,
                       void * param, int status)
{
    Context * context = (Context*) HTRequest_context(request);

    // Check for status
    HTPrint("Load resulted in status=%d context=%p\n", status, (void *) context);

    // One request fewer outstanding; once the last one has finished,
    // stop the event loop so the program can terminate.
    if (context && --(*context->premain) <= 0)
        HTEventList_stopLoop();

    // We are done with this request
    HT_FREE(context);
    HTRequest_delete(request);
    return HT_OK;
}
int main (int argc, char ** argv)
{
    char * urlSet[] = {
        "http://www.ibm.com",
        "http://www.cs.princeton.edu",
        "http://www.cs.washington.edu",
        "http://www.cs.berkeley.edu",
        "http://www-cs.stanford.edu",
        "http://www.blues-link.com",
        "http://www.findit.co.uk",
        "http://www.noad.com/",
        "http://www-leland.stanford.edu/~talarian/guitar.html",
        "http://www.travel-asia.com/dailies/techno.htm",
        "http://www.pacificrim.net/"
    };
    char * urlNewSet[] = {                /* (not used in this test) */
        "http://www.cse.iitb.ernet.in/"
    };
    const int nUrls = sizeof(urlSet)/sizeof(urlSet[0]);

    HTProfile_newRobot("Memex", "0");
    HTAAInit();
    HTAlertInit();
    HTProxy_add("http", "http://surya.cse.iitb.ernet.in:80/");

    /* Need our own trace and print functions */
    HTPrint_setCallback(printer);
    HTTrace_setCallback(tracer);

    /* Add our own filter to terminate the application */
    HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);

    /* Set the timeout for how long we are going to wait for a response */
    HTHost_setEventTimeout(20000);

    char fname[] = "wgotDDD.html";        /* per-URL output name (not used below) */
    int remain = nUrls;                   /* requests still outstanding */
    for ( int ux = 0; ux < nUrls; ux++ ) {
        sprintf(fname, "wgot%d.html", ux);
        const char * url = urlSet[ux];
        HTRequest * request = HTRequest_new();
        HTRequest_setOutputFormat (request, WWW_RAW);

        /* Byte-count stream: count the content and throw it away */
        HTStream * getLength = HTContentCounter(HTBlackHole(), request, -1);

        Context * ctx = (Context*) HT_CALLOC(1, sizeof(Context));
        ctx->premain = &remain;
        ctx->stream = getLength;
        HTRequest_setContext(request, ctx);

        BOOL status = HTLoadToStream(url, getLength, request);
        HTPrint("%d %s\n", status, url);
    }
    HTEventList_newLoop();
    return 0;
}
Received on Monday, 1 January 2001 06:00:46 UTC