- From: Asaf Segal <asaf@cs.huji.ac.il>
- Date: Sun, 17 Dec 2000 17:23:28 +0200
- To: n_d <n_d@sina.com>
- CC: www-lib@w3.org
n_d wrote: > Hi > I'm a newbie to libwww. I have a problem in do HTTP GET > operations, what I wanna do just is to fetch a html page and > all links within the page. > > part of my code: > //start > HTProfile_newHTMLNoCacheClient("mytest", "1.0"); > HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST); > HText_registerLinkCallback(foundlink); > HTRequest_setOutputFormat(request, WWW_SOURCE); > ... > anchor = HTAnchor_findAddress(url); > chunk = HTLoadAnchorToChunk(anchor, request); > HTEventList_loop(request); > //end > > The trouble is I can't got both html page and links. > Upper code running result is I can get html page in variable "chunk" > But CANNOT get links -- function "foundlink" which I registered was > never called. > > If I got rid of "HTRequest_setOutputFormat(request, WWW_SOURCE)" > the result is just opposite. > > Can I get both html page and links? Any help will be appreciated! > > ______________________________________ > > =================================================================== > 新浪免费电子邮箱 http://mail.sina.com.cn > > 花样年华 为青春保鲜! > http://ad.doubleclick.net/clk;2219239;4732092;g?http://www.fa-ya.com Here is some code that is based on the showlinks example. It has two problems that I would be happy to get a solution for: (1) The program doesn't stop: how can I know, in the terminate handler, that there aren't any more requests? (2) When I try to get an image, the program prompts for a file name. How can I avoid it? If I try HTRequest_setOutputFormat(new_request, WWW_SOURCE); I get: Interrupted! Fatal Error: Data transfer interrupted. How can I read the file into a string without this message? 
/*
**  readpage: load a URI with libwww, print every link found while the
**  HTML is parsed, and additionally fetch the targets of <FRAME> and <IMG>
**  elements. The program exits once every request it started has finished.
*/
#include "WWWLib.h"
#include "WWWInit.h"
#include "WWWHTML.h"

/*
**  Number of requests that have been started and have not yet reached
**  terminate_handler. When it drops to zero the event loop is stopped.
*/
PRIVATE int pending_requests = 0;

/* HTPrint callback: send application output to stdout. */
PRIVATE int printer (const char * fmt, va_list pArgs)
{
    return (vfprintf(stdout, fmt, pArgs));
}

/* HTTrace callback: send trace output to stderr. */
PRIVATE int tracer (const char * fmt, va_list pArgs)
{
    return (vfprintf(stderr, fmt, pArgs));
}

/*
**  After-filter registered with HT_FILTER_LAST for every request.
**  Deletes the finished request and stops the event loop when it was the
**  last outstanding one. Always returns HT_OK (the original fell off the
**  end of a non-void function, leaving the filter result undefined).
*/
PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
                               void * param, int status)
{
    /* We are done with this request */
    HTRequest_delete(request);

    /* No more requests in flight: let HTEventList_loop() return */
    if (--pending_requests <= 0) {
        pending_requests = 0;
        HTEventList_stopLoop();
    }
    return HT_OK;
}

/*
**  Start a GET for an embedded object (frame or image) at `address`.
**  The request is counted in pending_requests only while it is in flight;
**  if the load cannot be started the request is released immediately.
*/
PRIVATE void load_embedded (const char * address)
{
    char * uri = HTParse(address, NULL, PARSE_ALL);
    HTRequest * new_request;
    HTAnchor * new_anchor;

    if (!uri) return;

    new_request = HTRequest_new();
    new_anchor = HTAnchor_findAddress(uri);

    /* NOTE(review): HTAnchor_findAddress is expected to keep its own copy
    ** of the address, so the parsed string can be released here - confirm
    ** against the libwww HTAnchor documentation. */
    HT_FREE(uri);

    HTRequest_setAnchor(new_request, new_anchor);

    /* Count before loading so a load that completes inside the call
    ** cannot drive the counter to zero prematurely. */
    pending_requests++;
    if (HTLoadAnchor(new_anchor, new_request) != YES) {
        /* NOTE(review): assumes the after filters are not run when the
        ** load function returns NO, so terminate_handler will not see
        ** this request - confirm against HTNet_newClient. */
        pending_requests--;
        HTRequest_delete(new_request);
    }
}

/*
**  Link callback invoked by the HTML parser for each link it finds.
**  Prints the destination address and the element it occurred in, and
**  starts a new load for links found in FRAME and IMG elements.
**  Only `element_number` and `anchor` are used.
*/
PRIVATE void foundLink (HText * text, int element_number, int attribute_number,
                        HTChildAnchor * anchor, const BOOL * present,
                        const char ** value)
{
    HTAnchor * dest;
    char * address;
    HTTag * tag;

    if (!anchor) return;

    /*
    ** Find out which link we got. The anchor we are passed is a child
    ** anchor of the anchor we are currently parsing. We have to go from
    ** this child anchor to the actual destination.
    */
    dest = HTAnchor_followMainLink((HTAnchor *) anchor);
    address = HTAnchor_address(dest);
    if (!address) return;

    tag = SGML_findTag(HTML_dtd(), element_number);

    HTPrint("Found link `%s\'", address);
    if (tag) {
        char * tagname = HTTag_name(tag);
        HTPrint(" in element: <%s>\n", tagname ? tagname : "<unknown>");

        /* tagname may be NULL (see above), so guard before comparing */
        if (tagname &&
            (strcmp(tagname, "FRAME") == 0 || strcmp(tagname, "IMG") == 0)) {
            load_embedded(address);
        }
    } else {
        /* Terminate the output line even when the element is unknown */
        HTPrint("\n");
    }
    HT_FREE(address);
}

/*
**  Usage: readpage <uri>
**  Returns 0 in all cases (unchanged from the original).
*/
int main (int argc, char ** argv)
{
    char * uri = NULL;

    /* Create a new HTML client profile without a persistent cache */
    HTProfile_newHTMLNoCacheClient("ReadPage", "1.0");

    /* Need our own trace and print functions */
    HTPrint_setCallback(printer);
    HTTrace_setCallback(tracer);

    /* Set trace messages and alert messages */
#if 0
    HTSetTraceMessageMask("sop");
#endif

    /* Add our own termination filter */
    HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);

    /*
    ** Register our HTML link handler. We don't actually create a HText
    ** object as this is not needed. We only register the specific link
    ** callback.
    */
    HText_registerLinkCallback(foundLink);

    /* Setup a timeout on the request for 15 secs */
    HTHost_setEventTimeout(15000);

    /* Handle command line args */
    if (argc >= 2) uri = HTParse(argv[1], NULL, PARSE_ALL);

    if (uri) {
        HTRequest * request;
        HTAnchor * anchor;

        printf("Parsing %s\n", uri);

        /* Create a request and get an anchor object for the URI */
        request = HTRequest_new();
        anchor = HTAnchor_findAddress(uri);

        /* Issue the GET; links are reported through foundLink */
        pending_requests++;
        if (HTLoadAnchor(anchor, request) == YES) {
            /* Enter the event loop only if something is still in flight;
            ** otherwise nothing would ever stop it again. */
            if (pending_requests > 0) HTEventList_loop(request);
        } else {
            pending_requests--;
            HTRequest_delete(request);
        }
        HT_FREE(uri);
    } else {
        /* argv[1] is NULL when no argument was given; don't print it */
        if (argc >= 2) HTPrint("invalid address: %s\n", argv[1]);
        HTPrint("Type the URI to print out a list of embedded links\n");
        HTPrint("\treadpage <uri>\n");
        HTPrint("For example:\n");
        HTPrint("\treadpage http://www.w3.org\n");
    }

    /* Terminate libwww */
    HTProfile_delete();
    return 0;
}
Received on Sunday, 17 December 2000 10:23:31 UTC