[TRE-general] wchar backtracking glitches

Wolfgang Jenkner wjenkner at inode.at
Wed Nov 28 06:33:04 EET 2007


In [1] it was pointed out that

	any match on a wide string that uses backtracking either fails
	completely, or only returns a partial match.

Since I've also stumbled across such cases I looked a bit into this
(see the first patch in the darcs bundle below).

I also coaxed the retest suite into providing a test suite for regwcomp,
regwexec and friends as well.  The second patch below makes sure that
`make check' builds and runs the new `wretest'.  So I'd suggest doing
this first to see that a number of test cases (with backtracking)
actually do fail.  After applying the first patch there should be no
test failures, however.

Wolfgang

[1] http://laurikari.net/pipermail/tre-general/2007-January/000102.html

New patches:

[wchar buglets
Wolfgang Jenkner <wjenkner at inode.at>**20071127235041
 A typo prevented the correct definitions of the macros
 BT_STACK_WIDE_IN and BT_STACK_WIDE_OUT from being used.
 Pass BT_STACK_WIDE_IN an argument to get the correct expansion.
 
 (tre_tnfa_run_backtrack): Ignore the comment about ignoring multibyte
 characters and add a separate clause for STR_WIDE, using wmemcmp.
 Update str_wide.
] {
hunk ./lib/tre-match-backtrack.c 110
-#ifdef TRE_WHAR
-#define BT_STACK_WIDE_IN     stack->item.str_wide = (_str_wide)
-#define BT_STACK_WIDE_OUT    (_str_wide) = stack->item.str_wide
+#ifdef TRE_WCHAR
+#define BT_STACK_WIDE_IN(_str_wide)	stack->item.str_wide = (_str_wide)
+#define BT_STACK_WIDE_OUT		(str_wide) = stack->item.str_wide
hunk ./lib/tre-match-backtrack.c 114
-#define BT_STACK_WIDE_IN
+#define BT_STACK_WIDE_IN(_str_wide)
hunk ./lib/tre-match-backtrack.c 179
-      BT_STACK_WIDE_IN;							      \
+      BT_STACK_WIDE_IN(_str_wide);					      \
hunk ./lib/tre-match-backtrack.c 487
+#ifdef TRE_WCHAR
+	  else if (type == STR_WIDE)
+	    result = wmemcmp((wchar_t*)string + so, str_wide - 1, bt_len);
+#endif /* TRE_WCHAR */
hunk ./lib/tre-match-backtrack.c 492
-	    /* We can ignore multibyte characters here because the backref
-	       string is already aligned at character boundaries. */
hunk ./lib/tre-match-backtrack.c 511
+#ifdef TRE_WCHAR
+	      str_wide += bt_len - 1;
+#endif /* TRE_WCHAR */
}

[wretest - use the retest suite with regw*, too
Wolfgang Jenkner <wjenkner at inode.at>**20071127235950
 Add some code to translate the byte-oriented test cases in retest to
 wchar_t strings and offsets, then pass them to regwcomp and friends.
 This is done when the pre-processor symbol WRETEST is defined.  Change the
 build system accordingly, so that retest and the new wretest are built
 from the same file retest.c.
] {
hunk ./configure.ac 505
+  AM_CONDITIONAL(TRE_MULTIBYTE, true)
hunk ./configure.ac 508
+else
+  AM_CONDITIONAL(TRE_MULTIBYTE, false)
hunk ./tests/Makefile.am 10
+if TRE_MULTIBYTE
+check_PROGRAMS += wretest
+wretest_SOURCES = retest.c
+wretest_CPPFLAGS = -DWRETEST $(AM_CPPFLAGS)
+endif TRE_MULTIBYTE
hunk ./tests/Makefile.am 26
+if TRE_MULTIBYTE
+  wretest_LDADD = ../lib/libtre.la $(LDADD)
+endif TRE_MULTIBYTE
hunk ./tests/Makefile.am 54
+if TRE_MULTIBYTE
+  wretest_CFLAGS = -DMALLOC_DEBUGGING
+  wretest_LDADD = libxtre.la $(LDADD)
+endif TRE_MULTIBYTE
hunk ./tests/Makefile.am 67
+if TRE_MULTIBYTE
+TESTS += wretest
+endif TRE_MULTIBYTE
hunk ./tests/retest.c 53
+#ifdef WRETEST
+#include <wchar.h>
+#define CHAR_T wchar_t
+#define L(x) (L ## x)
+
+#define MAXSTRSIZE 1024
+static wchar_t wstr[MAXSTRSIZE];
+static wchar_t wregex[MAXSTRSIZE];
+static int woffs[MAXSTRSIZE];
+
+#define regexec regwexec
+#define regnexec regwnexec
+#define regcomp regwcomp
+#define regncomp regwncomp
+
+/* Iterate mbrtowc over the multi-byte sequence STR of length LEN,
+   store the result in BUF and memoize the successive byte offsets
+   in OFF.  */
+
+static int
+mbntowc (wchar_t *buf, const char *str, size_t len, int *off)
+{
+  mbstate_t cst;
+  int n, wlen;
+
+  if (len >= MAXSTRSIZE)
+    {
+      fprintf(stderr, "Increase MAXSTRSIZE to %d or more and recompile!\n",
+	len + 1);
+      exit(EXIT_FAILURE);
+    }
+
+  memset(&cst, 0, sizeof(cst));
+  if (off)
+    {
+      memset(off + 1, -1, len * sizeof(int));
+      *off = 0;
+    }
+
+  wlen = 0;
+  while (len > 0)
+    {
+      n = mbrtowc(buf ? buf++ : NULL, str, len, &cst);
+      if (n < 0)
+	return n;
+      if (n == 0)
+	n = 1;
+      str += n;
+      len -= n;
+      wlen += 1;
+      if (off)
+	*(off += n) = wlen;
+    }
+
+  return(wlen);
+}
+
+#else /* !WRETEST */
+#define CHAR_T char
+#define L(x) (x)
+#endif /* !WRETEST */
+
hunk ./tests/retest.c 120
-static char *regex_pattern;
+static CHAR_T *regex_pattern;
hunk ./tests/retest.c 149
-wrap_regexec(const char *data, size_t len,
+wrap_regexec(const CHAR_T *data, size_t len,
hunk ./tests/retest.c 152
-  char *buf = NULL;
+  CHAR_T *buf = NULL;
hunk ./tests/retest.c 165
-      buf = xmalloc(len + !use_regnexec);
+      buf = xmalloc((len + !use_regnexec) * sizeof(CHAR_T));
hunk ./tests/retest.c 168
-      memcpy(buf, data, len);
+      memcpy(buf, data, len * sizeof(CHAR_T));
hunk ./tests/retest.c 183
-      buf[len] = '\0';
+      buf[len] = L('\0');
hunk ./tests/retest.c 192
-wrap_regcomp(regex_t *preg, const char *data, size_t len, int cflags)
+wrap_regcomp(regex_t *preg, const CHAR_T *data, size_t len, int cflags)
hunk ./tests/retest.c 206
-execute(char *data, int len, size_t pmatch_len, regmatch_t *pmatch, int eflags)
+execute(CHAR_T *data, int len, size_t pmatch_len, regmatch_t *pmatch, int eflags)
hunk ./tests/retest.c 233
-check(va_list ap, int ret, char *str, size_t pmatch_len, regmatch_t *pmatch,
+check(va_list ap, int ret, CHAR_T *str, size_t pmatch_len, regmatch_t *pmatch,
hunk ./tests/retest.c 240
+#ifndef WRETEST
hunk ./tests/retest.c 244
+#else /* WRETEST */
+      printf("Exec error, regex: \"%ls\", cflags %d, "
+	     "string: \"%ls\", eflags %d\n", regex_pattern, cflags,
+	     str, eflags);
+#endif /* WRETEST */
hunk ./tests/retest.c 264
+#ifdef WRETEST
+	  if (rm_so >= 0)
+	    {
+	      int n = rm_so;
+
+	      if ((rm_so = woffs[rm_so]) < 0 ||
+		  (n = rm_eo, rm_eo = woffs[rm_eo]) < 0)
+		{
+		  printf("Invalid or incomplete multi-byte sequence "
+			 "in string %ls before byte offset %d\n", str, n);
+		  return 1;
+		}
+	    }
+#endif /* WRETEST */
hunk ./tests/retest.c 281
+#ifndef WRETEST
hunk ./tests/retest.c 286
+#else /* WRETEST */
+	      printf("Exec error, regex: \"%ls\", string: \"%ls\"\n",
+		     regex_pattern, str);
+	      printf("	group %d: expected (%d, %d) \"%.*ls\", "
+		     "got (%d, %d) \"%.*ls\"\n",
+#endif /* WRETEST */
hunk ./tests/retest.c 303
+#ifndef WRETEST
hunk ./tests/retest.c 305
+#else /* WRETEST */
+	  printf("Comp error, regex: \"%ls\"\n", regex_pattern);
+#endif /* WRETEST */
hunk ./tests/retest.c 317
+#ifndef WRETEST
hunk ./tests/retest.c 320
+#else /* WRETEST */
+	      printf("Exec error, regex: \"%ls\", string: \"%ls\"\n",
+		     regex_pattern, str);
+#endif /* WRETEST */
hunk ./tests/retest.c 349
+#ifdef WRETEST
+      {
+	int wlen = mbntowc(wstr, data, len, woffs);
+	if (wlen < 0)
+	  {
+	    exec_errors++;
+	    printf("Invalid or incomplete multi-byte sequence in %s\n", data);
+	    return;
+	  }
+	wstr[wlen] = L'\0';
+	len = wlen;
+      }
+#define data wstr
+#endif /* WRETEST */
+
hunk ./tests/retest.c 392
+#ifdef WRETEST
+#undef data
+#endif /* WRETEST */
+
hunk ./tests/retest.c 421
+#ifdef WRETEST
+      {
+	int wlen = mbntowc(wstr, str, len, woffs);
+	if (wlen < 0)
+	  {
+	    exec_errors++;
+	    printf("Invalid or incomplete multi-byte sequence in %s\n", str);
+	    return;
+	  }
+	wstr[wlen] = L'\0';
+	len = wlen;
+      }
+#define str wstr
+#endif /* WRETEST */
+
hunk ./tests/retest.c 465
+#ifdef WRETEST
+#undef str
+#endif /* WRETEST */
+
hunk ./tests/retest.c 479
-  regex_pattern = re;
-  cflags = flags;
-
-  comp_tests++;
hunk ./tests/retest.c 486
+  comp_tests++;
+
+#ifdef WRETEST
+  {
+    int wlen = mbntowc(wregex, re, len, NULL);
+
+    if (wlen < 0)
+      {
+	comp_errors++;
+	printf("Invalid or incomplete multi-byte sequence in %s\n", re);
+	return;
+      }
+    wregex[wlen] = L'\0';
+    len = wlen;
+  }
+#define re wregex
+#endif /* WRETEST */
+  regex_pattern = re;
+  cflags = flags;
+
hunk ./tests/retest.c 532
+#ifdef WRETEST
+#undef re
+#endif /* WRETEST */
+
hunk ./tests/retest.c 538
+#ifndef WRETEST
hunk ./tests/retest.c 540
+#else /* WRETEST */
+      printf("Comp error, regex: \"%ls\"\n", regex_pattern);
+#endif /* WRETEST */
hunk ./tests/retest.c 557
+#ifdef WRETEST
+  /* Need an 8-bit locale.  Or move the two tests with non-ascii
+     characters to the localized internationalization tests.  */
+  if (setlocale(LC_CTYPE, "fi_FI.ISO-8859-1") == NULL)
+    fprintf(stderr, "Could not set locale fi_FI.ISO-8859-1.  Expect some\n"
+		    "`Invalid or incomplete multi-byte sequence' errors.\n");
+#endif /* WRETEST */
}

Context:

[Fixed a bug in \<.
Ville Laurikari <vl at iki.fi>**20071104164756
 \< always matched at the beginning of the string.  Thanks to Shmuel
 Zeigerman for the bug report.
 
 See http://laurikari.net/pipermail/tre-general/2007-February/000128.html
] 
[Refactoring.
Ville Laurikari <vl at iki.fi>**20070316171802] 
[Fixed regoff_t documentation for wide characters.
Ville Laurikari <vl at iki.fi>**20070128190845
 The documentation erroneously claimed that offsets are always given in
 bytes (they are bytes in byte and multibyte strings, but wchar_t
 offsets in wchar_t strings).
 
 Thanks to Gregory Sharp for pointing this out.
] 
[TAG TRE 0.7.5
Ville Laurikari <vl at iki.fi>**20061210081147] 
Patch bundle hash:
80368e279d668f99138392793f79fd93b34efbc2




More information about the TRE-general mailing list