changeset 278:d73ccb155878

Implement support for outputting CP850 and CP437 in our simple fallback character set converter.
author Matti Hamalainen <ccr@tnsp.org>
date Mon, 06 Jan 2020 20:28:55 +0200
parents a23dae1a4849
children d5ab136cdc97
files sidinfo.c
diffstat 1 files changed, 123 insertions(+), 39 deletions(-) [+]
line wrap: on
line diff
--- a/sidinfo.c	Mon Jan 06 18:16:25 2020 +0200
+++ b/sidinfo.c	Mon Jan 06 20:28:55 2020 +0200
@@ -19,11 +19,6 @@
 // Some constants
 //
 
-// Default character encoding to convert to
-// NOTE! Do not change unless you are using iconv()!!
-// The fallback converter does not handle other encodings.
-#define SET_DEF_CHARSET      "utf8"
-
 // HVSC documents directory
 #define SET_HVSC_DOCUMENTS   "DOCUMENTS"
 
@@ -49,6 +44,15 @@
 };
 
 
+enum
+{
+    TH_LANG_UTF8,
+    TH_LANG_ISO88591,
+    TH_LANG_CP850,
+    TH_LANG_CP437,
+};
+
+
 typedef struct
 {
     int cmd;
@@ -130,9 +134,11 @@
 SIDLibSTILDB *sidSTILDB = NULL;
 
 
-BOOL    setUseChConv;
+BOOL    setUseOutConv;
 #ifdef HAVE_ICONV
-iconv_t setChConv;
+iconv_t setIConvCtx;
+#else
+int     setOutLang;
 #endif
 
 
@@ -382,6 +388,25 @@
 }
 
 
+static const uint8_t si_lang_iso88591_to_cp850[16*6] = {
+0xff, 0xad, 0xbd, 0x9c, 0xcf, 0xbe, 0xdd, 0xf5, 0xf9, 0xb8, 0xa6, 0xae, 0xaa, 0xf0, 0xa9, 0xee,
+0xf8, 0xf1, 0xfd, 0xfc, 0xef, 0xe6, 0xf4, 0xfa, 0xf7, 0xfb, 0xa7, 0xaf, 0xac, 0xab, 0xf3, 0xa8,
+0xb7, 0xb5, 0xb6, 0xc7, 0x8e, 0x8f, 0x92, 0x80, 0xd4, 0x90, 0xd2, 0xd3, 0xde, 0xd6, 0xd7, 0xd8,
+0xd1, 0xa5, 0xe3, 0xe0, 0xe2, 0xe5, 0x99, 0x9e, 0x9d, 0xeb, 0xe9, 0xea, 0x9a, 0xed, 0xe8, 0xe1,
+0x85, 0xa0, 0x83, 0xc6, 0x84, 0x86, 0x91, 0x87, 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b,
+0xd0, 0xa4, 0x95, 0xa2, 0x93, 0xe4, 0x94, 0xf6, 0x9b, 0x97, 0xa3, 0x96, 0x81, 0xec, 0xe7, 0x98,
+};
+
+static const uint8_t si_lang_iso88591_to_cp437[16*6] = {
+0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00,
+0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, 0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8,
+0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1,
+0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b,
+0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94, 0xf6, 0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x98,
+};
+
+
 char *siConvertCharset(const char *src)
 {
 #ifdef HAVE_ICONV
@@ -395,40 +420,81 @@
 
     while (srcLeft > 0)
     {
-        size_t ret = iconv(setChConv, &srcPtr, &srcLeft, &outPtr, &outLeft);
+        size_t ret = iconv(setIConvCtx, &srcPtr, &srcLeft, &outPtr, &outLeft);
         if (ret == (size_t) -1)
             break;
     }
 
 #else
-    // Fallback ISO-8859-1 to UTF-8 conversion
-    size_t srcSize = strlen(src),
-           outSize = srcSize * 2 + 1;
+    // Fallback conversion of ISO-8859-1 to the charset selected by setOutLang
+    size_t srcSize = strlen(src), outSize, minLeft;
     const uint8_t *srcPtr = (const uint8_t *) src;
+    const uint8_t *tab;
     uint8_t *outBuf, *outPtr;
-    if ((outBuf = outPtr = th_malloc(outSize + 1)) == NULL)
+
+    switch (setOutLang)
+    {
+        case TH_LANG_UTF8:
+            outSize = srcSize * 2;
+            minLeft = 2;
+            break;
+
+        default:
+            outSize = srcSize;
+            minLeft = 1;
+    }
+
+    if ((outBuf = outPtr = th_malloc(outSize)) == NULL)
         return NULL;
 
-    while (srcSize > 0 && outSize >= 2)
+    while (srcSize > 0 && outSize >= minLeft)
     {
-        if (*srcPtr < 0x80)
-        {
-            *outPtr++ = *srcPtr;
-            outSize--;
-        }
-        else
-        if (*srcPtr < 0xBF)
+        switch (setOutLang)
         {
-            *outPtr++ = 0xC2;
-            *outPtr++ = *srcPtr;
-            outSize -= 2;
+            case TH_LANG_UTF8:
+                // Not 100% correct: 0xBF takes the 0xC3 branch below (should be 0xC2 0xBF)
+                if (*srcPtr < 0x80)
+                {
+                    *outPtr++ = *srcPtr;
+                    outSize--;
+                }
+                else
+                if (*srcPtr < 0xBF)
+                {
+                    *outPtr++ = 0xC2;
+                    *outPtr++ = *srcPtr;
+                    outSize -= 2;
+                }
+                else
+                {
+                    *outPtr++ = 0xC3;
+                    *outPtr++ = *srcPtr - 0x40;
+                    outSize -= 2;
+                }
+                break;
+
+            case TH_LANG_ISO88591:
+                *outPtr++ = *srcPtr;
+                outSize--;
+                break;
+
+            case TH_LANG_CP850:
+            case TH_LANG_CP437:
+                // Not 100% correct: 0x7F-0x9F become '?', unmapped CP437 table entries are 0x00
+                tab = (setOutLang == TH_LANG_CP850) ? si_lang_iso88591_to_cp850 : si_lang_iso88591_to_cp437;
+
+                if (*srcPtr < 0x7f)
+                    *outPtr++ = *srcPtr;
+                else
+                if (*srcPtr >= 0xA0)
+                    *outPtr++ = tab[*srcPtr - 0xA0];
+                else
+                    *outPtr++ = '?';
+
+                outSize--;
+                break;
         }
-        else
-        {
-            *outPtr++ = 0xC3;
-            *outPtr++ = (*srcPtr - 0x40);
-            outSize -= 2;
-        }
+
         srcPtr++;
         srcSize--;
     }
@@ -943,11 +1009,11 @@
 static void siPrintPSIDInfoLine(FILE *outFile, BOOL *shown,
     const char *fmt, const int otype,
     const char *d_str, const int d_int,
-    const BOOL useConv)
+    const BOOL convert)
 {
     char *str, *tmp;
 
-    if (setUseChConv && d_str != NULL && useConv)
+    if (setUseOutConv && d_str != NULL && convert)
     {
         char *tmp2 = siConvertCharset(d_str);
         tmp = siEscapeString(tmp2, optEscapeChars);
@@ -1318,15 +1384,33 @@
                 setLang[i++] = th_tolower(*ptr);
         }
         setLang[i] = 0;
-    }
 
 #ifdef HAVE_ICONV
-    // Initialize iconv, check if we have language/charset
-    setChConv = iconv_open(setLang != NULL ? setLang : SET_DEF_CHARSET, "iso88591");
-    setUseChConv = setChConv != (iconv_t) -1;
+        // Initialize iconv, check if we have language/charset
+        setIConvCtx = iconv_open("utf8", "iso88591");
+        setUseOutConv = setIConvCtx != (iconv_t) -1;
 #else
-    setUseChConv = setLang != NULL && strcmp(setLang, SET_DEF_CHARSET) == 0;
+        // Check if we can use our fallback converter
+        if (strcmp(setLang, "utf8") == 0)
+            setOutLang = TH_LANG_UTF8;
+        else
+        if (strcmp(setLang, "iso88591") == 0 ||
+            strcmp(setLang, "cp819") == 0 ||
+            strcmp(setLang, "latin1") == 0 ||
+            strcmp(setLang, "cp28591") == 0)
+            setOutLang = TH_LANG_ISO88591;
+        else
+        if (strcmp(setLang, "cp850") == 0)
+            setOutLang = TH_LANG_CP850;
+        else
+        if (strcmp(setLang, "cp437") == 0)
+            setOutLang = TH_LANG_CP437;
+        else
+            setOutLang = TH_LANG_ISO88591;
+
+        setUseOutConv = setOutLang != TH_LANG_ISO88591;
 #endif
+    }
 
     // Parse command line arguments
     if (!th_args_process(argc, argv, optList, optListN,
@@ -1334,7 +1418,7 @@
         goto out;
 
     THMSG(2, "Requested output LANG='%s', use charset conversion=%s\n",
-        setLang, setUseChConv ? "yes" : "no");
+        setLang, setUseOutConv ? "yes" : "no");
 
     if (optOneLineFieldSep != NULL)
     {
@@ -1483,8 +1567,8 @@
 out:
 
 #ifdef HAVE_ICONV
-    if (setUseChConv)
-        iconv_close(setChConv);
+    if (setUseOutConv)
+        iconv_close(setIConvCtx);
 #endif
 
     th_free(setLang);