Remove byte-order mark from UTF-8 strings

The byte-order marks are useless for UTF-8, but that doesn't mean we don't find them in the wild. Get rid of them; they confuse the hell out of the collation functions. Reported by Kai Elwert.
2011-04-30 18:52:33 +02:00 · 2011-04-30 18:52:33 +02:00 · 75dc4106a8
parent 055be880d4
commit 75dc4106a8
1 changed file with 10 additions and 1 deletion
--- a/src/misc.c
+++ b/src/misc.c
@@ -423,7 +423,16 @@ unicode_fixup_string(char *str)

  /* String is valid UTF-8 */
  if (!u8_check((uint8_t *)str, len))
+    {
+      if (len >= 3)
+	{
+	  /* Check for and strip byte-order mark */
+	  if (memcmp("\xef\xbb\xbf", str, 3) == 0)
+	    memmove(str, str + 3, len - 3 + 1);
+	}
+
      return str;
+    }

  ret = u8_conv_from_encoding("ascii", iconveh_question_mark, str, len, NULL, NULL, &len);
  if (!ret)