[Bug 885250] [NEW] libc iconv does not reject surrogates when transcoding from UTF-32le to UTF-8

Launchpad Bug Tracker 885250 at bugs.launchpad.net
Wed Nov 2 14:28:37 UTC 2011


You have been subscribed to a public bug:

Compile and run the following program:

"""

#include <stdio.h>
#include <errno.h>
#include <iconv.h>

/*
 * Reproducer: converts two UTF-32LE code units that are low surrogates
 * (U+DCA1 and U+DCA5) to UTF-8 and reports what iconv() did with them.
 *
 * Exit status:
 *   0  iconv() accepted the input; the bytes it produced are dumped in hex
 *   1  the conversion descriptor could not be opened
 *   2  iconv() reported an error (the errno name is printed)
 *   3  the conversion descriptor could not be closed
 */
int main(int argc, char **argv) {
	(void)argc;
	(void)argv;

	iconv_t cd = iconv_open("UTF-8", "UTF-32LE");
	//iconv_t cd = iconv_open("UCS-2LE", "UCS-2LE");
	if (cd == (iconv_t)-1) {
		printf("Could not open: %d\n", errno);
		return 1;
	}

	//char in_buf[] = { 0xA1, 0xDC, 0xA5, 0xDC };
	//char in_buf[] = { 0xDC, 0xA1, 0xDC, 0xA5 };
	// Raw bytes are kept in unsigned char: values above 0x7F do not fit in a
	// signed char, and plain char may be signed.
	unsigned char in_buf[] = { 0xA1, 0xDC, 0x00, 0x00, 0xA5, 0xDC, 0x00, 0x00 };
	char out_buf[20];

	// iconv() takes char ** cursors and advances them as it consumes/produces.
	char *in_start = (char *)in_buf;
	char *in_buf_p = in_start;   size_t in_buf_left = sizeof in_buf;
	char *out_buf_p = out_buf;   size_t out_buf_left = sizeof out_buf;
	size_t conv_count = iconv(cd, &in_buf_p, &in_buf_left, &out_buf_p, &out_buf_left);

	if (conv_count == (size_t)-1) {
		// Capture errno before any other library call can overwrite it.
		int conv_errno = errno;
		switch (conv_errno) {
		// Triggered by invalid multibyte sequence in input
		case EILSEQ:	printf("Conversion error: EILSEQ\n"); break;
		// Not enough space in output buffer
		case E2BIG:	printf("Conversion error: E2BIG\n"); break;
		// Incomplete multibyte sequence in input
		case EINVAL:	printf("Conversion error: EINVAL\n"); break;
		// Some other unknown error
		default:	printf("Conversion error: %d\n", conv_errno); break;
		}
		// Best-effort release; the conversion failure is what gets reported.
		(void)iconv_close(cd);
		return 2;
	}

	// All three values are size_t, so they need %zu; %d here is undefined behaviour.
	size_t consumed = (size_t)(in_buf_p - in_start);
	size_t produced = (size_t)(out_buf_p - out_buf);
	printf("Consumed %zu, produced %zu, converted %zu\n", consumed, produced, conv_count);
	for (char *out_buf_read = out_buf; out_buf_read < out_buf_p; out_buf_read++) {
		printf("\t%x\n", (unsigned char)*out_buf_read);
	}

	if (iconv_close(cd) != 0) {
		printf("Could not close: %d\n", errno);
		return 3;
	}

	return 0;
}
"""

Expected result:
"""
Conversion error: EILSEQ
"""

Actual result:
"""
Consumed 8, produced 6, converted 0
	ed
	b2
	a1
	ed
	b2
	a5
"""

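This UTF-8 byte sequence (ED B2 A1 ED B2 A5) is invalid according to the
Unicode standard and RFC 3629 because it encodes the surrogate code points
U+DCA1 and U+DCA5, which must never appear in well-formed UTF-8.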

Note that if you take this output byte sequence and run it through iconv
*again* (with both input and output encodings as UTF-8) then EILSEQ is
reported as expected.

** Affects: eglibc (Ubuntu)
     Importance: Undecided
         Status: New

-- 
libc iconv does not reject surrogates when transcoding from UTF-32le to UTF-8
https://bugs.launchpad.net/bugs/885250
You received this bug notification because you are a member of Ubuntu Foundations Bugs, which is subscribed to eglibc in Ubuntu.




More information about the foundations-bugs mailing list