[Prev][Next][Index][Thread]

Also looking for info how to get at HREFs within retrieved obj



Hello, similar to this morning's post, I am also looking for
details on how best to get at the HREFs within the retrieved HTML
file.  Specifically, after HTLoadAbsolute succeeds, how do I get
at any HREFs contained within it?  Is there an object chain to traverse,
or does it require parsing through the HTML?

Thanks.

-----------------------------------------------------------------------

/*
   findurl.c - Resolve URLs out on the World-Wide Web using the public
   domain libwww.  

   06oct95   giza@ultranet.com   Began.
*/

#include <stdio.h>
#include <stdlib.h>
#include <sysexits.h>

#include <WWWLib.h>
#include "findurl.h"

#define DASH '-'

/* Forward declarations for the functions defined in this file. */
void main(int argc, char **argv);   /* argument count first, then vector */
void show_usage(char *pname);
BOOLEAN resolve_url(char *url, char *fname);

/*
  Use the W3C Reference Library to fetch `url` and write the unparsed
  source of the document to `fname`.

  url    - absolute URL handed to HTLoadAbsolute().
  fname  - output file name; if its first character is '-', stdout is
           used instead of opening a file.

  Returns TRUE when HTLoadAbsolute() reports HT_LOADED, FALSE otherwise
  (including when the output file or the request object cannot be created).

  Note, do *not* explicitly close the file once it has been handed to
  HTFWriter_new().  The library does that for you.  The file is only closed
  here on the error path where the library never received it.
*/
BOOLEAN resolve_url(char *url, char *fname)
  {
  FILE *fp;
  BOOLEAN return_value = FALSE;
  HTRequest *request;
  int status;

  /* A leading dash selects stdout; anything else names the output file. */
  if (DASH == *fname)
    fp = stdout;
  else
    fp = fopen(fname, "w");

  if (NULL == fp)
    {
    /* Say why the output file could not be created instead of failing silently. */
    perror(fname);
    return(return_value);
    }

  request = HTRequest_new();
  if (NULL == request)
    {
    /* No writer stream exists yet, so nobody else will close fp for us. */
    if (stdout != fp)
      fclose(fp);
    return(return_value);
    }

  /* Ask for the raw document via a blocking GET, streamed straight to fp. */
  request->method = HTMethod_enum("GET");
  request->output_format = WWW_SOURCE;
  request->BlockingIO = TRUE;
  /* NOTE(review): the FALSE argument is presumably the "leave open" flag,
     which is why the library ends up closing fp - confirm in HTFWrite.h. */
  request->output_stream = HTFWriter_new(fp, FALSE);

  status = HTLoadAbsolute(url, request);
  switch (status)
    {
    case HT_LOADED:
      /* NOTE(review): with fname "-" this debug line shares stdout with the
         document itself (and the library may already have closed it) -
         consider moving it to stderr. */
      if (NULL != getenv("DEBUG"))
        printf("URI: [%s]\n", HTAnchor_address((HTAnchor*)request->anchor));
      return_value = TRUE;
      break;

    case HT_ERROR:
      fprintf(stderr, "HT_ERROR: Error has occured\n");
      break;

    default:
      fprintf(stderr, "HTLoadAbsolute returned error code %d\n", status);
      break;
    }

  HTRequest_delete(request);
  return(return_value);
  }

/*
  Print a two-line usage summary to stderr.
  pname - program name substituted into the first line.
*/
void show_usage(char *pname)
  {
  FILE *out = stderr;

  /* First line carries the program name; the second is constant text. */
  fprintf(out, "Usage:\t%s URL-goes-here name-of-outfile-here\n", pname);
  fputs("if name-of-output-file is \"-\", then stdout is used\n", out);
  }

/*
  Program entry point: expects exactly two arguments, a URL and an output
  file name ("-" for stdout), resolves the URL into that file and exits.

  Exit status is EX_USAGE for a bad command line, EX_UNAVAILABLE when the
  URL could not be resolved, and EX_OK otherwise.

  The return type stays void to match the forward declaration earlier in
  this file; the status is always delivered through exit().
*/
void main(int argc, char **argv)
  {
  char *url, *fname;
  int status = EX_OK;

  /* Validate the command line. */
  if (3 != argc || (NULL==(url=argv[1])) || (NULL==(fname=argv[2])))
    {
    show_usage(argv[0]);
    exit(EX_USAGE);
    }

  /* Initialize the W3C Reference library. */
  if (NULL != getenv("DEBUG"))
    WWW_TraceFlag = 1;
  HTLibInit();

  /* Resolve the URL.  resolve_url() returns a boolean, so test it as one
     rather than comparing against NULL, and remember the failure so the
     caller's shell can see it. */
  if (!resolve_url(url, fname))
    {
    fprintf(stderr, "Unable to resolve URL %s into file %s\n", url, fname);
    status = EX_UNAVAILABLE;
    }

  /* Close the W3C Reference Library. */
  HTLibTerminate();

  /* All done. */
  exit(status);
  }