nmh-workers
[Top] [All Lists]

Re: [Nmh-workers] scan or show of UTF-encoded headers?

2005-02-22 11:06:48
On 14 Feb, I wrote:
> It's probably easier to hack the C code. I've had a quick go at
> producing something which uses iconv to convert stuff to the native
> character set (patch is below). Would be good if you could try this out
> and look for ways to improve it.

I've now produced something good enough that I'll put it in CVS unless
someone complains first. This now includes configure tests for finding
iconv and a feature to put a question mark in place of any characters
iconv failed to convert. I've not been able to test the configure
changes on many systems so it would be good if you could have a go at
compiling this on any systems you have access to.

If you try this out from a UTF-8 locale, you're likely to notice that
nmh can't yet handle multibyte characters when truncating/padding
strings to fit a particular width. I've put fixing that on my todo list.

Oliver

Index: configure.in
===================================================================
RCS file: /cvsroot/nmh/nmh/configure.in,v
retrieving revision 1.66
diff -u -r1.66 configure.in
--- configure.in        27 Jan 2005 16:26:24 -0000      1.66
+++ configure.in        22 Feb 2005 17:47:30 -0000
@@ -445,7 +445,7 @@
 AC_CHECK_HEADERS(string.h memory.h stdlib.h unistd.h errno.h fcntl.h \
                  limits.h crypt.h termcap.h termio.h termios.h locale.h \
                  langinfo.h netdb.h sys/param.h sys/time.h sys/utsname.h \
-                 arpa/inet.h arpa/ftp.h)
+                 iconv.h arpa/inet.h arpa/ftp.h)
 
 
 AC_CACHE_CHECK(POSIX termios, nmh_cv_sys_posix_termios,
@@ -547,6 +547,46 @@
 done
 AC_SUBST(TERMLIB)dnl
 
+dnl ---------------
+dnl CHECK FOR ICONV
+dnl ---------------
+
+dnl Find iconv. It may be in libiconv and may be iconv() or libiconv()
+if test "x$ac_cv_header_iconv_h" = "xyes"; then
+  AC_CHECK_FUNC(iconv, ac_found_iconv=yes, ac_found_iconv=no)
+  if test "x$ac_found_iconv" = "xno"; then
+    AC_CHECK_LIB(iconv, iconv, ac_found_iconv=yes)
+    if test "x$ac_found_iconv" = "xno"; then
+      AC_CHECK_LIB(iconv, libiconv, ac_found_iconv=yes)
+    fi
+    if test "x$ac_found_iconv" != "xno"; then
+      LIBS="-liconv $LIBS"
+    fi
+  fi
+fi
+if test "x$ac_found_iconv" = xyes; then
+  AC_DEFINE(HAVE_ICONV, 1, [Define if you have the iconv() function.])
+fi
+
+dnl Check if iconv uses const in prototype declaration
+if test "x$ac_found_iconv" = "xyes"; then
+  AC_CACHE_CHECK(for iconv declaration, ac_cv_iconv_const,
+    [AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <stdlib.h>
+        #include <iconv.h>]],
+        [[#ifdef __cplusplus
+          "C"
+          #endif
+          #if defined(__STDC__) || defined(__cplusplus)
+          size_t iconv (iconv_t cd, char * *inbuf, size_t *inbytesleft, char * *outbuf, size_t *outbytesleft);
+          #else
+          size_t iconv();
+          #endif]])],
+      [ac_cv_iconv_const=],
+      [ac_cv_iconv_const=const])])
+  AC_DEFINE_UNQUOTED([ICONV_CONST], $ac_cv_iconv_const,
+    [Define as const if the declaration of iconv() needs const.])
+fi
+
 dnl --------------
 dnl CHECK FOR NDBM
 dnl --------------
Index: h/prototypes.h
===================================================================
RCS file: /cvsroot/nmh/nmh/h/prototypes.h,v
retrieving revision 1.9
diff -u -r1.9 prototypes.h
--- h/prototypes.h      27 Jan 2005 16:26:24 -0000      1.9
+++ h/prototypes.h      22 Feb 2005 17:47:30 -0000
@@ -61,6 +61,7 @@
 char **getans (char *, struct swit *);
 int getanswer (char *);
 char **getarguments (char *, int, char **, int);
+char *get_charset();
 char *getcpy (char *);
 char *getfolder(int);
 int lkclose(int, char*);
Index: sbr/fmt_rfc2047.c
===================================================================
RCS file: /cvsroot/nmh/nmh/sbr/fmt_rfc2047.c,v
retrieving revision 1.2
diff -u -r1.2 fmt_rfc2047.c
--- sbr/fmt_rfc2047.c   2 Jul 2002 22:09:14 -0000       1.2
+++ sbr/fmt_rfc2047.c   22 Feb 2005 17:47:30 -0000
@@ -10,6 +10,10 @@
  */
 
 #include <h/mh.h>
+#ifdef HAVE_ICONV
+#  include <iconv.h>
+#  include <errno.h>
+#endif
 
 static signed char hexindex[] = {
     -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
@@ -61,6 +65,12 @@
     int between_encodings = 0; /* are we between two encodings?          */
     int equals_pending = 0;    /* is there a '=' pending?                */
     int whitespace = 0;                /* how much whitespace between encodings? */
+#ifdef HAVE_ICONV
+    int use_iconv = 0;          /* are we converting encoding with iconv? */
+    iconv_t cd;
+    int fromutf8;
+    char *saveq, *convbuf;
+#endif
 
     if (!str)
        return 0;
@@ -73,6 +83,14 @@
        return 0;
 
     for (p = str, q = dst; *p; p++) {
+
+        /* reset iconv */
+#ifdef HAVE_ICONV
+        if (use_iconv) {
+           iconv_close(cd);
+           use_iconv = 0;
+        }
+#endif
        /*
         * If we had an '=' character pending from
         * last iteration, then add it first.
@@ -106,9 +124,20 @@
            if (!*pp)
                continue;
 
-           /* Check if character set is OK */
-           if (!check_charset(startofmime, pp - startofmime))
+           /* Check if character set can be handled natively */
+           if (!check_charset(startofmime, pp - startofmime)) {
+#ifdef HAVE_ICONV
+               /* .. it can't. We'll use iconv then. */
+               *pp = '\0';
+               cd = iconv_open(get_charset(), startofmime);
+               fromutf8 = !strcasecmp(startofmime, "UTF-8");
+               *pp = '?';
+                if (cd == (iconv_t)-1) continue;
+               use_iconv = 1;
+#else
                continue;
+#endif
+           }
 
            startofmime = pp + 1;
 
@@ -159,6 +188,14 @@
            if (between_encodings)
                q -= whitespace;
 
+#ifdef HAVE_ICONV
+           if (use_iconv) {
+               saveq = q;
+               if (!(q = convbuf = (char *)malloc(endofmime - startofmime)))
+                   continue;
+            }
+#endif
+
            /* Now decode the text */
            if (quoted_printable) {
                for (pp = startofmime; pp < endofmime; pp++) {
@@ -218,6 +255,35 @@
                }
            }
 
+#ifdef HAVE_ICONV
+            /* Convert to native character set */
+           if (use_iconv) {
+               size_t inbytes = q - convbuf;
+               size_t outbytes = BUFSIZ;
+               ICONV_CONST char *start = convbuf;
+               
+               while (inbytes) {
+                   if (iconv(cd, &start, &inbytes, &saveq, &outbytes) ==
+                           (size_t)-1) {
+                       if (errno != EILSEQ) break;
+                       /* character couldn't be converted. we output a `?'
+                        * and try to carry on which won't work if
+                        * either encoding was stateful */
+                       iconv (cd, 0, 0, &saveq, &outbytes);
+                       *saveq++ = '?';
+                        /* skip to next input character */
+                       if (fromutf8) {
+                           for (start++;(*start & 192) == 128;start++)
+                               inbytes--;
+                       } else
+                           start++, inbytes--;
+                   }
+               }
+               q = saveq;
+               free(convbuf);
+           }
+#endif
+           
            /*
             * Now that we are done decoding this particular
             * encoded word, advance string to trailing '='.
@@ -229,6 +295,9 @@
            whitespace = 0;             /* re-initialize amount of whitespace */
        }
     }
+#ifdef HAVE_ICONV
+    if (use_iconv) iconv_close(cd);
+#endif
 
     /* If an equals was pending at end of string, add it now. */
     if (equals_pending)
Index: sbr/fmt_scan.c
===================================================================
RCS file: /cvsroot/nmh/nmh/sbr/fmt_scan.c,v
retrieving revision 1.13
diff -u -r1.13 fmt_scan.c
--- sbr/fmt_scan.c      30 Sep 2003 19:55:12 -0000      1.13
+++ sbr/fmt_scan.c      22 Feb 2005 17:47:30 -0000
@@ -130,7 +130,7 @@
                                sp++;\
                        }\
                        while ((c = (unsigned char) *sp++) && --i >= 0 && cp < ep)\
-                               if (isgraph(c)) \
+                               if (!iscntrl(c) && !isspace(c)) \
                                    *cp++ = c;\
                                else {\
                                        while ((c = (unsigned char) *sp) && (iscntrl(c) || isspace(c)))\
@@ -148,7 +148,7 @@
                    while ((c = (unsigned char) *sp) && (iscntrl(c) || isspace(c)))\
                        sp++;\
                    while((c = (unsigned char) *sp++) && cp < ep)\
-                       if (isgraph(c)) \
+                       if (!iscntrl(c) && !isspace(c)) \
                            *cp++ = c;\
                        else {\
                            while ((c = (unsigned char) *sp) && (iscntrl(c) || isspace(c)))\


_______________________________________________
Nmh-workers mailing list
Nmh-workers(_at_)nongnu(_dot_)org
http://lists.nongnu.org/mailman/listinfo/nmh-workers

<Prev in Thread] Current Thread [Next in Thread>