- From: Soumen Chakrabarti <soumen@cse.iitb.ernet.in>
- Date: Mon, 1 Jan 2001 16:23:41 +0530 (IST)
- To: www-lib@w3.org
We wish to write a "robot" that does not extract links to follow from crawled pages, but is instead given a batch of URLs to fetch (a batch-mode bulk-fetch routine, in effect). We tried using the non-blocking robot profile, added proxy support, and issued requests for several URLs at once; the target stream for each request is a byte-count stream. The after filter fires right away with a status of -1 (HT_ERROR) for every page. We would appreciate some guidance with this, thanks. Code follows:

#include "WWWLib.h"   /* Global Library Include file */
#include "WWWMIME.h"  /* MIME parser/generator */
#include "WWWNews.h"  /* News access module */
#include "WWWHTTP.h"  /* HTTP access module */
#include "WWWFTP.h"
#include "WWWFile.h"
#include "WWWGophe.h"
#include "WWWInit.h"

#define APP_NAME            "GETTOOL"
#define APP_VERSION         "1.0"
#define DEFAULT_OUTPUT_FILE "get.out"

PRIVATE int printer (const char * fmt, va_list pArgs)
{
    return (vfprintf(stdout, fmt, pArgs));
}

PRIVATE int tracer (const char * fmt, va_list pArgs)
{
    return (vfprintf(stderr, fmt, pArgs));
}

/* Per-request context: a pointer to the shared count of outstanding
** requests and the byte-count stream the response is piped into. */
class Context {
public:
    int * premain;
    HTStream * stream;
};

/*
** We get called here from the event loop when we are done
** loading. Here we terminate the program as we have nothing
** better to do.
*/
int terminate_handler (HTRequest * request, HTResponse * response,
                       void * param, int status)
{
    Context * context = (Context *) HTRequest_context(request);

    // Check for status
    HTPrint("Load resulted in status=%d context=%x\n", status, context);

    // We are done with this request
    HT_FREE(context);
    HTRequest_delete(request);
    return 0;
}

int main (int argc, char ** argv)
{
    char * urlSet[] = {
        "http://www.ibm.com",
        "http://www.cs.princeton.edu",
        "http://www.cs.washington.edu",
        "http://www.cs.berkeley.edu",
        "http://www-cs.stanford.edu",
        "http://www.blues-link.com",
        "http://www.findit.co.uk",
        "http://www.noad.com/",
        "http://www-leland.stanford.edu/~talarian/guitar.html",
        "http://www.travel-asia.com/dailies/techno.htm",
        "http://www.pacificrim.net/"
    };
    char * urlNewSet[] = {
        "http://www.cse.iitb.ernet.in/"
    };

    HTProfile_newRobot("Memex", "0");
    HTAAInit();
    HTAlertInit();
    HTProxy_add("http", "http://surya.cse.iitb.ernet.in:80/");

    /* Need our own trace and print functions */
    HTPrint_setCallback(printer);
    HTTrace_setCallback(tracer);

    /* Add our own filter to terminate the application */
    HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);

    /* Set the timeout for how long we are going to wait for a response */
    HTHost_setEventTimeout(20000);

    char fname[] = "wgotDDD.html";
    int remain = sizeof(urlSet)/sizeof(char*);
    for ( int ux = 0; ux < sizeof(urlSet)/sizeof(char*); ux++ ) {
        sprintf(fname, "wgot%d.html", ux);
        const char * url = urlSet[ux];

        HTRequest * request = HTRequest_new();
        HTRequest_setOutputFormat(request, WWW_RAW);

        /* Pipe the response into a byte-count stream over a black hole */
        HTStream * getLength = HTContentCounter(HTBlackHole(), request, -1);

        Context * ctx = (Context *) HT_CALLOC(1, sizeof(Context));
        ctx->premain = &remain;
        ctx->stream  = getLength;
        HTRequest_setContext(request, ctx);

        BOOL status = HTLoadToStream(url, getLength, request);
        HTPrint("%d %s\n", status, url);
    }

    HTEventList_newLoop();
    return 0;
}
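For completeness, this is roughly how we intend the premain counter to be used once the basic fetching works: the after filter would decrement the shared count of outstanding requests and leave the event loop when it reaches zero. A sketch only, assuming the includes and the Context class from the program above; the terminate_handler as posted does not do this yet:

int terminate_handler (HTRequest * request, HTResponse * response,
                       void * param, int status)
{
    Context * context = (Context *) HTRequest_context(request);
    HTPrint("Load resulted in status=%d context=%x\n", status, context);

    /* One fewer request outstanding; stop the event loop when all are done */
    if (context && context->premain && --(*context->premain) <= 0)
        HTEventList_stopLoop();

    HT_FREE(context);
    HTRequest_delete(request);
    return 0;
}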