RichEdit EM_STREAMIN CP_UTF8 nulls out some input characters



When I read a file encoded as UTF-8 into a RichEdit control, e.g.

SendMessage(hrichedit, EM_STREAMIN,
SF_TEXT | SF_USECODEPAGE | (CP_UTF8 << 16),
(LPARAM)(&editstream));

some of the characters from the input file are being replaced with nulls.

The first null usually occurs at offset 0xffd or 0xffe in the text buffer
(as retrieved by GetWindowText), and subsequent nulls occur after the same
interval, e.g. at 0x1ffb and 0x2ff9. Using a hex file viewer, I verified
that the input file was correct, with all normal ASCII characters (no
nulls), as originally input when the file was created. This occurs even on
files that contain only 7-bit ASCII characters, with nothing in them that
requires UTF-8 multi-byte encoding. If I use Notepad to insert a UTF-8
byte-order mark (BOM) at the front of the file, the first null doesn't turn
up in the GetWindowText buffer until 0x1ff8.

This started occurring in a general-purpose text editor, so to isolate the
problem, I created a stripped-down program that did nothing except create
the window and read in a file. The problem still occurs.

I wonder if there are any known problems with UTF-8 streamin.

I am on XP with all high-priority updates applied as of right now.

In case anyone wants to look it over or try it out, I am including the test
program below. It has to be compiled with UNICODE defined. Here is the
command file I use to build it, using the Borland line-mode compiler:

bcc32 -c -6 -W -WU -Ie:\bcc55\include /Le:\bcc55\lib x.c
if ERRORLEVEL 1 goto :EOF
ilink32 /aa /Le:\bcc55\lib c0w32w.obj x.obj,x.exe,,import32.lib cw32i.lib
if ERRORLEVEL 1 goto :EOF
rem *** build complete ***

Thanks,
Allie

#include <wchar.h>
#include <windows.h>
#include <richedit.h>

/*
 * Identity helper: hands its argument straight back.  wWinMain passes its
 * otherwise unused parameters through it.
 */
long nothing(long l)
{
    return l;
}

/* Top-level window, created in wWinMain; owner of the debug message boxes. */
HWND hmain;
/* Rich edit child control, created by MainProc while handling WM_CREATE. */
HWND hedit;
/* Running total of bytes handed out by StreamIn; reset by reader(). */
int lenin;
/* Name of the file to load; also used as the main window's title. */
WCHAR filename[80];
/* Scratch buffer in which wsprintf builds the debug/error messages. */
WCHAR msgbuild[100];
/* Message structure used by the message loop in wWinMain. */
static MSG msg;
/* Copy of the hInst passed to wWinMain; used when creating the edit control. */
static HINSTANCE instance;

/*
 * EM_STREAMIN callback: fills `buffer` with up to `count` bytes read from
 * the file handle carried in `fd` (the EDITSTREAM cookie).
 *
 * On success stores the number of bytes read in *recount, adds it to the
 * global `lenin` counter and returns FALSE.  On a read failure leaves
 * *recount at 0, shows the Win32 error code in a message box and returns
 * TRUE.
 */
DWORD CALLBACK StreamIn(DWORD_PTR fd, LPBYTE buffer,
                        LONG count, LONG* recount) {
    DWORD got;
    BOOL ok;

    *recount = 0;
    ok = ReadFile((HANDLE)fd, buffer, count, &got, 0);
    if (ok) {
        *recount = got;
        lenin += got;
        return FALSE;
    }

    wsprintf(msgbuild, L"File Read Error %d", GetLastError());
    MessageBox(hmain, msgbuild, L"debug", MB_OK);
    return TRUE;
}

/*
 * Streams the file named by `filename` into the rich edit control as UTF-8
 * text, then copies the control's text back out and shows one message box
 * for every null character found in it.  Each box gives the number of bytes
 * read, the window text length (including terminator) and the offset of the
 * null, in decimal and hex.  The last box reported is the string terminator
 * itself, so a clean load produces exactly one box.
 *
 * All offsets and lengths are in WCHAR units, and the whole text is scanned.
 */
static void reader(void) {
    int len;
    int end;
    int i;
    WCHAR* bufp;
    EDITSTREAM es;
    HANDLE fd;
    WPARAM stream;

    SetWindowText(hedit, L"");
    fd = CreateFile(filename, FILE_READ_DATA, 0, 0, OPEN_EXISTING, 0, 0);
    if (fd == INVALID_HANDLE_VALUE) {
        MessageBox(0, L"CreateFile failed", L"debug", MB_OK);
        return;
    }

    /* Plain text, interpreted with the code page given in the high word. */
    stream = SF_TEXT | SF_USECODEPAGE | (CP_UTF8 << 16);
    lenin = 0;
    es.dwCookie = (DWORD_PTR)fd;
    es.dwError = FALSE;
    es.pfnCallback = (EDITSTREAMCALLBACK)StreamIn;
    SendMessage(hedit, EM_STREAMIN, stream, (LPARAM)(&es));

    /* The file is no longer needed once streaming has finished. */
    CloseHandle(fd);

    len = GetWindowTextLength(hedit) + 1;
    /*
     * Zero the buffer: GetWindowTextLength may over-estimate, and the scan
     * below must never look at uninitialised memory.
     */
    bufp = (WCHAR*)HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY,
                             len * sizeof(WCHAR));
    if (!bufp) {
        MessageBox(hmain, L"HeapAlloc failed", L"debug", MB_OK);
    } else {
        GetWindowText(hedit, bufp, len);

        /*
         * Locate the real end of the text by skipping the trailing zero
         * fill; bufp[end] is then the string terminator (end <= len - 1).
         */
        end = len - 1;
        while (end > 0 && bufp[end - 1] == 0)
            --end;

        /* Report every null up to and including the terminator. */
        for (i = 0; i <= end; ++i) {
            if (bufp[i] != 0)
                continue;
            wsprintf(msgbuild, L"read %d %x window %d %x null %d %x",
                     lenin, lenin, len, len, i, i);
            MessageBox(hmain, msgbuild, L"debug", MB_OK);
        }
        HeapFree(GetProcessHeap(), 0, bufp);
    }

    if (es.dwError)
        MessageBox(0, L"es.dwError", L"debug", MB_OK);
}

/*
 * Window procedure of the main window.
 *
 *   WM_DESTROY  - posts the quit message.
 *   WM_CREATE   - creates the rich edit child (stored in `hedit`) and shows
 *                 the error code in a message box if that fails.
 *   WM_SIZE     - resizes the edit control to the new client size, except
 *                 when the window is being minimized.
 *   WM_SETFOCUS - passes the focus on to the edit control.
 *
 * Every other message goes to DefWindowProc.  Handled messages return 0.
 */
LRESULT APIENTRY MainProc(HWND hwnd, UINT msg, WPARAM wparam, LPARAM lparam)
{
    if (msg == WM_DESTROY) {
        PostQuitMessage(0);
        return 0;
    }

    if (msg == WM_CREATE) {
        DWORD flags = WS_CHILD | WS_VSCROLL | ES_AUTOVSCROLL | ES_MULTILINE |
                      ES_NOHIDESEL | WS_VISIBLE;

        /* Created with zero size; the first WM_SIZE gives it real bounds. */
        hedit = CreateWindowEx(0, RICHEDIT_CLASS, L"", flags,
                               0, 0, 0, 0, hwnd, (HMENU)1, instance, 0);
        if (hedit)
            return 0;

        wsprintf(msgbuild, L"hedit error %d", GetLastError());
        MessageBox(0, msgbuild, L"debug", MB_OK);
        return 0;
    }

    if (msg == WM_SIZE) {
        if (wparam != SIZE_MINIMIZED)
            MoveWindow(hedit, 0, 0, LOWORD(lparam), HIWORD(lparam), TRUE);
        return 0;
    }

    if (msg == WM_SETFOCUS) {
        SetFocus(hedit);
        return 0;
    }

    return DefWindowProc(hwnd, msg, wparam, lparam);
}

/*
 * Program entry point.
 *
 * Takes the file name from the command line (everything after the program
 * name; "ed.c" when nothing is given), loads riched20.dll from the system
 * directory, registers the "xMain" window class, creates and shows the main
 * window, loads the file through reader() and then runs the message loop.
 *
 * Returns the wParam of the final message, or 0 when start-up fails.
 */
int APIENTRY wWinMain(HINSTANCE hInst, HINSTANCE hdummy, LPCTSTR pdummy, int
idummy) {
    WNDCLASS wc;
    const WCHAR* arg;
    WCHAR dllpath[MAX_PATH + 16];
    UINT syslen;

    /*
     * Skip the program name.  It may be quoted when its path contains
     * spaces, so a plain search for the first blank is not enough.
     */
    arg = GetCommandLine();
    if (*arg == L'"') {
        ++arg;
        while (*arg && *arg != L'"')
            ++arg;
        if (*arg == L'"')
            ++arg;
    } else {
        while (*arg && *arg != L' ')
            ++arg;
    }
    while (*arg == L' ')
        ++arg;
    if (*arg == 0)
        arg = L"ed.c";

    /* Bounded copy: lstrcpyn always terminates and never overruns. */
    lstrcpyn(filename, arg, sizeof(filename) / sizeof(filename[0]));

    instance = hInst;

    /*
     * The rich edit window class only exists once riched20.dll is loaded.
     * Build the path from the real system directory instead of assuming
     * C:\WINDOWS.
     */
    syslen = GetSystemDirectory(dllpath, MAX_PATH);
    if (syslen == 0 || syslen >= MAX_PATH) {
        wsprintf(msgbuild, L"GetSystemDirectory Error %d", GetLastError());
        MessageBox(0, msgbuild, L"debug", MB_OK);
        return 0;
    }
    lstrcat(dllpath, L"\\riched20.dll");
    if (!LoadLibrary(dllpath)) {
        wsprintf(msgbuild, L"LoadLibrary Error %d", GetLastError());
        MessageBox(0, msgbuild, L"debug", MB_OK);
        return 0;
    }

    ZeroMemory(&wc, sizeof(wc));
    wc.style = CS_HREDRAW | CS_VREDRAW;
    wc.lpfnWndProc = (WNDPROC)MainProc;
    wc.cbClsExtra = 0;
    wc.cbWndExtra = 0;
    wc.hInstance = hInst;
    wc.hIcon = NULL; // LoadIcon(NULL, IDI_APPLICATION);
    wc.hCursor = LoadCursor(NULL, IDC_ARROW);
    wc.hbrBackground = 0; // GetStockObject(LTGRAY_BRUSH);
    wc.lpszMenuName = 0;
    wc.lpszClassName = L"xMain";
    if (!RegisterClass(&wc)) {
        wsprintf(msgbuild, L"RegisterClass Error %d", GetLastError());
        MessageBox(0, msgbuild, L"debug", MB_OK);
        return 0;
    }

    hmain = CreateWindow(L"xMain", filename,
        WS_OVERLAPPED | WS_CAPTION | WS_SYSMENU | WS_MINIMIZEBOX,
        CW_USEDEFAULT, CW_USEDEFAULT, 770, 480, 0, 0, hInst, 0);
    if (!hmain) {
        wsprintf(msgbuild, L"hmain error %d", GetLastError());
        MessageBox(0, msgbuild, L"debug", MB_OK);
        return 0;
    }
    ShowWindow(hmain, SW_SHOW);
    UpdateWindow(hmain);
    reader();

    /* GetMessage returns -1 on error; "> 0" keeps that from looping forever. */
    while (GetMessage(&msg, 0, 0, 0) > 0) {
        TranslateMessage(&msg);
        DispatchMessage(&msg);
    }

    /* Touch the unused parameters so the compiler does not warn about them. */
    nothing((long)hdummy);
    nothing((long)pdummy);
    nothing((long)idummy);
    return (int)msg.wParam;
}


.



Relevant Pages

  • Re: String "â€™" translated to apostrophe. Why?
    ... it works), though it seems to use mostly just Ascii characters, representing ... but the author is not making the best possible use of UTF-8. ... They don't map it to ASCII apostrophe, ... Latin 1 encoding. ...
    (alt.html)
  • [PATCH] UTF-8 input: composing non-latin1 characters, and copy-paste
    ... One can put the keyboard driver into Unicode mode, load a Unicode keymap, and get single keystrokes generate valid UTF-8 for non-ASCII characters. ...
    (Linux-Kernel)
  • Re: Attention: European C/C++/C#/Java Programmers-Call for Input
    ... For any language using a Latin ... Look at existing tools and source code that supports UTF-8, and see how it can make your work easier and give a result that users might actually be able to *use*. ... But you'll find something that does a reasonable job and *will* work perfectly for most programmers who stick to ASCII identifiers. ... A related problem is if you are making identifiers case-insensitive - it's hard to figure out cases for non-ASCII characters. ...
    (comp.arch.embedded)
  • Re: Special Characters in Query String
    ... I've had numerous problems with utf-8, ... in common characters in spanish not geting displayed. ... > available for encoding of characters. ... > If you can display your characters with ISO-8859-1, ...
    (microsoft.public.dotnet.framework.aspnet)
  • Re: Enhanced Unicode support for "Go" tools
    ... maybe Rene and Randy to note, perhaps - is an "ASCII compatible" ... version of UNICODE...in fact, for strict 7-bit ASCII, UTF-8 and ... characters so, being on Windows, that opinion makes great sense ... where the majority of the supported languages ...
    (alt.lang.asm)