/* * Copyright (C) 2001 Ximian Inc. * * Authors: Michael Zucchi <notzed@helixcode.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public License * as published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "camel-mime-filter-html.h" #include <stdio.h> #include <string.h> #include <stdarg.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #include <stdlib.h> #include "xmlmemory.h" #include "HTMLparser.h" #include "HTMLtree.h" #define d(x) static void camel_mime_filter_html_class_init (CamelMimeFilterHTMLClass *klass); static void camel_mime_filter_html_init (CamelObject *o); static void camel_mime_filter_html_finalize (CamelObject *o); static CamelMimeFilterClass *camel_mime_filter_html_parent; struct _CamelMimeFilterHTMLPrivate { htmlParserCtxtPtr ctxt; }; /* ********************************************************************** */ /* HTML parser */ #define ARRAY_LEN(x) (sizeof(x)/sizeof((x)[0])) static struct { char *element; char *remap; } map_start[] = { { "p", "\n\n" }, { "br", "\n" }, { "h1", "\n" }, { "h2", "\n" }, { "h3", "\n" }, { "h4", "\n" }, { "h5", "\n" }, { "h6", "\n" }, }; static struct { char *element; char *remap; } map_end[] = { { "h1", "\n" }, { "h2", "\n" }, { "h3", "\n" }, { "h4", "\n" }, { "h5", "\n" }, { "h6", "\n" }, }; static void characters(void *ctx, const xmlChar *ch, int len) { CamelMimeFilter *mf = ctx; memcpy(mf->outptr, ch, len); mf->outptr+= len; } #if 0 /* we probably dont want to index comments */ static void comment(void *ctx, const xmlChar *value) { CamelMimeFilter *mf = ctx; mf->outptr += sprintf(mf->outptr, " %s \n", value); } #endif /* we map element starts to stuff sometimes, so we can properly break up words and lines. This is very dumb, and needs to be smarter: e.g. <b>F</b>\nooBar should -> "FooBar" */ static void startElement(void *ctx, const xmlChar *name, const xmlChar **atts) { int i; CamelMimeFilter *mf = ctx; /* we grab all "content" from "meta" tags, and dump it in the output, it might be useful for searching with. This should probably be pickier */ if (!strcasecmp(name, "meta")) { if (atts) { for (i=0;atts[i];i+=2) { if (!strcmp(atts[i], "content")) mf->outptr += sprintf(mf->outptr, " %s \n", atts[i+1]); } } return; } /* FIXME: use a hashtable */ for (i=0;i<ARRAY_LEN(map_start);i++) { if (!strcasecmp(map_start[i].element, name)) { characters(ctx, map_start[i].remap, strlen(map_start[i].remap)); break; } } } static void endElement(void *ctx, const xmlChar *name) { int i; /* FIXME: use a hashtable */ for (i=0;i<ARRAY_LEN(map_end);i++) { if (!strcasecmp(map_end[i].element, name)) { characters(ctx, map_end[i].remap, strlen(map_end[i].remap)); break; } } } /* dum de dum, well we can print out some crap for now */ static void warning(void *ctx, const char *msg, ...) { va_list args; va_start(args, msg); fprintf(stdout, "SAX.warning: "); vfprintf(stdout, msg, args); va_end(args); } static void error(void *ctx, const char *msg, ...) { va_list args; va_start(args, msg); fprintf(stdout, "SAX.error: "); vfprintf(stdout, msg, args); va_end(args); } static void fatalError(void *ctx, const char *msg, ...) { va_list args; va_start(args, msg); fprintf(stdout, "SAX.fatalError: "); vfprintf(stdout, msg, args); va_end(args); } static xmlSAXHandler indexSAXHandler = { NULL, /* internalSubset */ NULL, /*isStandalone,*/ NULL, /*hasInternalSubset,*/ NULL, /*hasExternalSubset,*/ NULL, /*resolveEntity,*/ NULL, /*getEntity,*/ NULL, /*entityDecl,*/ NULL, /*notationDecl,*/ NULL, /*attributeDecl,*/ NULL, /*elementDecl,*/ NULL, /*unparsedEntityDecl,*/ NULL, /*setDocumentLocator,*/ NULL, /*startDocument,*/ NULL, /*endDocument,*/ startElement, endElement, NULL, /*reference,*/ characters, NULL, /*ignorableWhitespace,*/ NULL, /*processingInstruction,*/ NULL, /*comment,*/ warning, error, fatalError, NULL, /*getParameterEntity,*/ }; /* ********************************************************************** */ CamelType camel_mime_filter_html_get_type (void) { static CamelType type = CAMEL_INVALID_TYPE; if (type == CAMEL_INVALID_TYPE) { type = camel_type_register (camel_mime_filter_get_type (), "CamelMimeFilterHTML", sizeof (CamelMimeFilterHTML), sizeof (CamelMimeFilterHTMLClass), (CamelObjectClassInitFunc) camel_mime_filter_html_class_init, NULL, (CamelObjectInitFunc) camel_mime_filter_html_init, (CamelObjectFinalizeFunc) camel_mime_filter_html_finalize); } return type; } static void camel_mime_filter_html_finalize(CamelObject *o) { CamelMimeFilterHTML *f = (CamelMimeFilterHTML *)o; if (f->priv->ctxt) htmlFreeParserCtxt(f->priv->ctxt); } static void camel_mime_filter_html_init (CamelObject *o) { CamelMimeFilterHTML *f = (CamelMimeFilterHTML *)o; f->priv = g_malloc0(sizeof(*f->priv)); } static void complete(CamelMimeFilter *mf, char *in, size_t len, size_t prespace, char **out, size_t *outlenptr, size_t *outprespace) { CamelMimeFilterHTML *f = (CamelMimeFilterHTML *)mf; camel_mime_filter_set_size(mf, len*2+256, FALSE); mf->outptr = mf->outbuf; d(printf("converting html end:\n%.*s\n", (int)len, in)); if (f->priv->ctxt == NULL) { f->priv->ctxt = htmlCreatePushParserCtxt(&indexSAXHandler, f, in, len, "", 0); len = 0; } htmlParseChunk(f->priv->ctxt, in, len, 1); *out = mf->outbuf; *outlenptr = mf->outptr - mf->outbuf; *outprespace = mf->outbuf - mf->outreal; d(printf("converted html end:\n%.*s\n", (int)*outlenptr, *out)); } static void filter(CamelMimeFilter *mf, char *in, size_t len, size_t prespace, char **out, size_t *outlenptr, size_t *outprespace) { CamelMimeFilterHTML *f = (CamelMimeFilterHTML *)mf; camel_mime_filter_set_size(mf, len*2+16, FALSE); mf->outptr = mf->outbuf; d(printf("converting html:\n%.*s\n", (int)len, in)); if (f->priv->ctxt == NULL) f->priv->ctxt = htmlCreatePushParserCtxt(&indexSAXHandler, f, in, len, "", 0); else htmlParseChunk(f->priv->ctxt, in, len, 0); *out = mf->outbuf; *outlenptr = mf->outptr - mf->outbuf; *outprespace = mf->outbuf - mf->outreal; d(printf("converted html:\n%.*s\n", (int)*outlenptr, *out)); } static void reset(CamelMimeFilter *mf) { CamelMimeFilterHTML *f = (CamelMimeFilterHTML *)mf; if (f->priv->ctxt != NULL) { htmlFreeParserCtxt(f->priv->ctxt); f->priv->ctxt = NULL; } } static void camel_mime_filter_html_class_init (CamelMimeFilterHTMLClass *klass) { CamelMimeFilterClass *filter_class = (CamelMimeFilterClass *) klass; camel_mime_filter_html_parent = CAMEL_MIME_FILTER_CLASS (camel_type_get_global_classfuncs (camel_mime_filter_get_type ())); filter_class->reset = reset; filter_class->filter = filter; filter_class->complete = complete; } /** * camel_mime_filter_html_new: * * Create a new CamelMimeFilterHTML object. * * Return value: A new CamelMimeFilterHTML widget. **/ CamelMimeFilterHTML * camel_mime_filter_html_new (void) { CamelMimeFilterHTML *new = CAMEL_MIME_FILTER_HTML ( camel_object_new (camel_mime_filter_html_get_type ())); return new; }