W3C home > Mailing lists > Public > www-lib@w3.org > October to December 2000

Re: a simple question about HTTP GET

From: Asaf Segal <asaf@cs.huji.ac.il>
Date: Sun, 17 Dec 2000 17:23:28 +0200
Message-ID: <3A3CDA70.284CACDC@cs.huji.ac.il>
To: n_d <n_d@sina.com>
CC: www-lib@w3.org
n_d wrote:

> Hi
>    I'm a newbie to libwww. I have a problem doing HTTP GET
> operations; all I want to do is fetch an HTML page and
> all the links within the page.
>
> part of my code:
> //start
> HTProfile_newHTMLNoCacheClient("mytest", "1.0");
> HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
> HText_registerLinkCallback(foundlink);
> HTRequest_setOutputFormat(request, WWW_SOURCE);
> ...
> anchor = HTAnchor_findAddress(url);
> chunk = HTLoadAnchorToChunk(anchor, request);
> HTEventList_loop(request);
> //end
>
> The trouble is that I can't get both the HTML page and the links.
> With the code above I get the HTML page in the variable "chunk",
> but I CANNOT get the links -- the function "foundlink" which I registered is
> never called.
>
> If I get rid of "HTRequest_setOutputFormat(request, WWW_SOURCE)"
> the result is just the opposite.
>
> Can I get both html page and links? Any help will be appreciated!
>
> ______________________________________
>
> ===================================================================
> 新浪免费电子邮箱 http://mail.sina.com.cn
>
> 花样年华  为青春保鲜!
> http://ad.doubleclick.net/clk;2219239;4732092;g?http://www.fa-ya.com

Here is some code that is based on the showlinks example.
It has three problems that I'd be happy to get a solution for:
The program doesn't stop: how can I know in the terminate handler that
there aren't any more requests?
When I try to get an image, the program prompts for a file name. How can I
avoid it?
If I try HTRequest_setOutputFormat(new_request, WWW_SOURCE); I get:
Interrupted!
Fatal Error: Data transfer interrupted
How can I read the file into a string without this message?

#include "WWWLib.h"
#include "WWWInit.h"
#include "WWWHTML.h"

/*
** Print callback installed with HTPrint_setCallback: formats the message
** onto stdout and hands back whatever vfprintf reports.
*/
PRIVATE int printer (const char * fmt, va_list pArgs)
{
    int written = vfprintf(stdout, fmt, pArgs);

    return written;
}

/*
** Trace callback installed with HTTrace_setCallback: formats the message
** onto stderr and hands back whatever vfprintf reports.
*/
PRIVATE int tracer (const char * fmt, va_list pArgs)
{
    int written = vfprintf(stderr, fmt, pArgs);

    return written;
}

/*
** After filter run by libwww when a request terminates (main registers it
** for every status with HTNet_addAfter).
**
**  request   the request that has just finished; it is deleted here
**  response  response object belonging to the request (unused)
**  param     context pointer supplied at registration time (unused)
**  status    termination status of the request (unused)
**
** Returns HT_ERROR so that libwww does not run any further after filter
** on the request object that was just deleted.
*/
PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
          void * param, int status)
{
    /* We are done with this request */
    HTRequest_delete(request);

    /*
    ** libwww itself is deliberately not shut down here: foundLink may have
    ** issued additional requests that are still outstanding.
    **
    ** The function is declared to return int, so it must return a value;
    ** anything other than HT_OK ends the after-filter chain, which is what
    ** we want now that the request no longer exists.
    */
    return HT_ERROR;
}

/*
** Link callback registered with HText_registerLinkCallback. Prints the
** destination address of every link it is given and, for links found in
** FRAME or IMG elements, starts a new load of the destination.
**
**  text              HText object being built (unused)
**  element_number    index of the element, looked up in the HTML DTD
**  attribute_number  index of the attribute holding the link (unused)
**  anchor            child anchor for the link; nothing is done if NULL
**  present, value    attribute presence flags and values (unused)
*/
PRIVATE void foundLink (HText * text,
   int element_number, int attribute_number,
   HTChildAnchor * anchor,
   const BOOL * present, const char ** value)
{
    HTAnchor * dest;
    char * address;
    HTTag * tag;
    char * tagname = NULL;

    if (!anchor) return;

    /*
    **  Find out which link we got. The anchor we are passed is
    **  a child anchor of the anchor we are currently parsing. We
    **  have to go from this child anchor to the actual destination.
    */
    dest = HTAnchor_followMainLink((HTAnchor *) anchor);
    address = HTAnchor_address(dest);
    tag = SGML_findTag(HTML_dtd(), element_number);
    if (tag) tagname = HTTag_name(tag);

    /* Never hand a NULL pointer to a %s conversion */
    HTPrint("Found link `%s\'", address ? address : "<unknown>");
    if (tag)
        HTPrint(" in element: <%s>\n", tagname ? tagname : "<unknown>");
    else
        HTPrint("\n");          /* always terminate the output line */

    /* Embedded objects are fetched as well; tagname may be NULL */
    if (address && tagname &&
        (strcmp(tagname, "FRAME") == 0 || strcmp(tagname, "IMG") == 0)) {
        char * uri = HTParse(address, NULL, PARSE_ALL);
        if (uri) {
            HTRequest * new_request = HTRequest_new();
            HTAnchor * new_anchor = HTAnchor_findAddress(uri);
            HTRequest_setAnchor(new_request, new_anchor);
            HTLoadAnchor(new_anchor, new_request);

            /* The string returned by HTParse is ours to release */
            HT_FREE(uri);
        }
    }

    if (address) {
        HT_FREE(address);
    }
}




/*
** Program entry point: sets up libwww, loads the URI given as the first
** command line argument and runs the event loop so that foundLink gets
** called for the links in the document. Without a usable URI a short
** usage text is printed instead. Always returns 0.
*/
int main (int argc, char ** argv)
{
    char * uri = NULL;

    /* Initialise libwww with the HTML no-cache client profile */
    HTProfile_newHTMLNoCacheClient("ReadPage", "1.0");

    /* Need our own trace and print functions */
    HTPrint_setCallback(printer);
    HTTrace_setCallback(tracer);

    /* Set trace messages and alert messages */
#if 0
    HTSetTraceMessageMask("sop");
#endif

    /* Add our own termination filter */
    HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);

    /*
    ** Register our HTML link handler. We don't actually create a HText
    ** object as this is not needed. We only register the specific link
    ** callback.
    */
    HText_registerLinkCallback(foundLink);

    /* Setup a timeout on the request for 15 secs */
    HTHost_setEventTimeout(15000);

    /* Handle command line args */
    if (argc >= 2)
        uri = HTParse(argv[1], NULL, PARSE_ALL);

    if (uri) {
        HTRequest * request = NULL;
        HTAnchor * anchor = NULL;
        BOOL status = NO;

        printf("Parsing %s\n", uri);

        /* Create a request */
        request = HTRequest_new();

        /* Get an anchor object for the URI */
        anchor = HTAnchor_findAddress(uri);

        /* Issue the GET */
        status = HTLoadAnchor(anchor, request);

        /* Go into the event loop... */
        if (status == YES) HTEventList_loop(request);

        /* The string returned by HTParse is ours to release */
        HT_FREE(uri);
    } else {
        /* argv[1] is only a valid string when an argument was given */
        if (argc >= 2)
            HTPrint("invalid address: %s\n", argv[1]);
        HTPrint("Type the URI to print out a list of embedded links\n");
        HTPrint("\treadpage <uri>\n");
        HTPrint("For example:\n");
        HTPrint("\treadpage http://www.w3.org\n");
    }

    /* Tear down what HTProfile_newHTMLNoCacheClient set up */
    HTProfile_delete();
    return 0;
}
Received on Sunday, 17 December 2000 10:23:31 GMT

This archive was generated by hypermail 2.2.0+W3C-0.50 : Monday, 23 April 2007 18:18:38 GMT