当使用cURL从internet上取出字符流时,何时将数据从多字节数据类型转换为单个字节字符数组?
我编写了一个程序here,它似乎在回调函数中使用了ASCII。
然而,我编写了另一个程序,它使用带有wchar_t数据类型的UTF-8,它似乎也能工作。虽然我的机器上的wchar_t类型是4字节,而char是1字节,但是datastream似乎没有区分这两种数据类型。
我猜想存在某种对程序透明的类型转换,但我并不确定(我认为在UTF-8中,ASCII字符仍然只占用1字节内存;而当程序使用wchar_t数据类型时,系统会用零把普通的ASCII字符填充为4字节,这一步并不是由程序员实现的)。
#include "multicurl.h"
/* Upper bound, in milliseconds, passed to each curl_multi_wait() call (5 seconds).
   The expansion is parenthesized so it stays a single value inside larger expressions. */
#define MAX_WAIT_MSECS (5 * 1000)
/* libcurl write callback: appends one chunk of the response body to the
 * growing buffer described by *userdata.
 *
 * libcurl hands over raw bytes (size * nmemb of them), not wide characters,
 * so every offset below is computed in bytes. The parameter keeps its
 * wchar_t* declaration for compatibility with the rest of this file, but the
 * data it points to is only ever treated as bytes.
 *
 * ptr      - chunk delivered by libcurl; not NUL-terminated.
 * size     - element size reported by libcurl.
 * nmemb    - number of elements in the chunk.
 * userdata - the MemType registered through CURLOPT_WRITEDATA.
 *
 * Returns the number of bytes consumed (size * nmemb). Terminates the
 * process with EXIT_FAILURE if the buffer cannot be grown.
 */
static size_t write_callback(wchar_t *ptr, size_t size, size_t nmemb, void *userdata){
    size_t realsize = nmemb * size; /* Bytes in this chunk. */
    MemType *mem = (MemType *)userdata;

    /* Room for the data gathered so far, the new chunk, and the terminator. */
    wchar_t *tmp = realloc(mem->memory, mem->size + realsize + sizeof(wchar_t));
    if (tmp == NULL){
        printf("Not Enough Memory, realloc returned NULL.\n");
        exit(EXIT_FAILURE);
    }
    mem->memory = tmp;

    /* mem->size is a byte count and is generally not a multiple of
       sizeof(wchar_t). Indexing the buffer in wchar_t units would truncate
       the offset and overwrite bytes that were already stored, so address
       the buffer through a byte pointer instead. */
    char *bytes = (char *)mem->memory;
    memcpy(bytes + mem->size, ptr, realsize);
    mem->size += realsize;

    /* Keep the buffer terminated; the next chunk overwrites these zero bytes. */
    memset(bytes + mem->size, 0, sizeof(wchar_t));

    return realsize; /* libcurl treats any other value as a write error. */
}
/* Builds a libcurl easy handle for one URL and attaches it to a multi handle.
 *
 * mh       - multi handle the new easy handle is added to.
 * utf8_url - request URL as a NUL-terminated wide-character string.
 * output   - receives the response body through write_callback(); its buffer
 *            is initialized here to an empty, terminated string.
 *
 * Always returns NULL; results are delivered through *output. If anything
 * fails (allocation, URL conversion, curl_easy_init, curl_multi_add_handle)
 * no handle is left attached and *output keeps its empty buffer
 * (output->memory is NULL if even that first allocation failed).
 */
void *SetUpCurlHandle(CURLM * mh, wchar_t *utf8_url, MemType *output){
    CURL *hnd = NULL;
    char *url = NULL;
    size_t url_cap = 0;
    size_t url_len = 0;

    /* Start from an empty, terminated buffer so *output is always readable,
       even when no data ever arrives. */
    output->size = 0;
    output->memory = malloc( sizeof( wchar_t ) );
    if (output->memory == NULL){
        fprintf(stderr, "error: out of memory while preparing the output buffer\n");
        return NULL;
    }
    output->memory[0] = '\0';

    /* Convert the wide URL to a multibyte string. Each wide character may
       expand to as many as MB_CUR_MAX bytes, so size the buffer for that
       worst case plus the terminator, and give wcstombs() the real capacity. */
    url_cap = wcslen( utf8_url ) * MB_CUR_MAX + 1;
    url = malloc( url_cap );
    if (url == NULL){
        fprintf(stderr, "error: out of memory while converting the URL\n");
        return NULL;
    }
    url_len = wcstombs(url, utf8_url, url_cap);
    if (url_len == (size_t)-1 || url_len >= url_cap){
        /* (size_t)-1: a wide character has no multibyte form in this locale. */
        fprintf(stderr, "error: the URL could not be converted to a multibyte string\n");
        free(url);
        return NULL;
    }

    hnd = curl_easy_init();
    if(hnd){
        curl_easy_setopt(hnd, CURLOPT_BUFFERSIZE, 102400L);
        curl_easy_setopt(hnd, CURLOPT_URL, url);// Set the request URL
        curl_easy_setopt(hnd, CURLOPT_NOPROGRESS, 1L);
        curl_easy_setopt(hnd, CURLOPT_USERAGENT, "curl/7.80.0");
        curl_easy_setopt(hnd, CURLOPT_MAXREDIRS, 50L);
        curl_easy_setopt(hnd, CURLOPT_HTTP_VERSION, (long)CURL_HTTP_VERSION_2TLS);
        curl_easy_setopt(hnd, CURLOPT_FTP_SKIP_PASV_IP, 1L);
        curl_easy_setopt(hnd, CURLOPT_TCP_KEEPALIVE, 1L);
        curl_easy_setopt(hnd, CURLOPT_WRITEFUNCTION, write_callback);// Callback that stores the response body.
        curl_easy_setopt(hnd, CURLOPT_WRITEDATA, (void *)output);// Passed to the callback as userdata.
        //curl_easy_setopt(hnd, CURLOPT_VERBOSE, 1);
        if (curl_multi_add_handle(mh, hnd) != CURLM_OK){
            /* Nobody else knows about this handle, so release it here. */
            curl_easy_cleanup(hnd);
        }
    }

    /* libcurl keeps its own copy of the CURLOPT_URL string, so ours can be
       released on every path. */
    free(url);
    return NULL;// Results travel through *output; there is nothing to return.
}
/* Initializes libcurl's global state with CURL_GLOBAL_ALL, then returns
   whatever curl_multi_init() produces (the result is passed through unchecked). */
CURLM *SetUpMultiCurlHandle(){
    curl_global_init(CURL_GLOBAL_ALL);
    return curl_multi_init();
}
void *PerformMultiCurl(CURLM * mh)
/* Drives every transfer attached to the multi handle to completion, then
 * releases the easy handles, the multi handle, and libcurl's global state.
 *
 * mh - multi handle prepared with SetUpMultiCurlHandle()/SetUpCurlHandle().
 *
 * Always returns NULL. If curl_multi_wait() fails the function reports the
 * error and returns immediately WITHOUT cleaning up, so mh is still valid
 * for the caller in that case.
 */
{
    CURLMsg *msg = NULL;
    int still_running = 0;
    int msgs_left = 0;

    curl_multi_perform(mh, &still_running);// Kick off the requests.

    /* Keep pumping until no transfer is running; without this loop the
       function would move on before the responses have arrived. */
    do {
        int numfds = 0;
        int res = curl_multi_wait(mh, NULL, 0, MAX_WAIT_MSECS, &numfds);
        if(res != CURLM_OK) {
            fprintf(stderr, "error: curl_multi_wait() returned %d\n", res);
            return NULL;
        }
        curl_multi_perform(mh, &still_running);
    } while(still_running);

    /* Drain the completion messages and release every finished easy handle. */
    while ((msg = curl_multi_info_read(mh, &msgs_left))) {
        if (msg->msg != CURLMSG_DONE) {
            fprintf(stderr, "error: after curl_multi_info_read(), CURLMsg=%d\n", msg->msg);
            continue;
        }

        /* Copy what we need first: msg is not used again after the handle
           has been removed. */
        CURL *hnd = msg->easy_handle;
        CURLcode return_code = msg->data.result;

        if(return_code != CURLE_OK) {
            fprintf(stderr, "CURL error code: %d\n", (int)return_code);
        }

        /* Failed transfers must be detached and freed too; skipping them
           would leak the handle and leave it attached while the multi
           handle is destroyed below. */
        curl_multi_remove_handle(mh, hnd);
        curl_easy_cleanup(hnd);
    }

    curl_multi_cleanup(mh);
    curl_global_cleanup();
    return NULL;
}
这个程序的全部UTF-8变体可以找到here。
发布于 2022-01-10 17:16:41
正如你所预料的,它并不能正常工作。libcurl无法知道回调函数期望的是wchar_t*,而它本应期望char*。
如果您检查MyOutputStruct1.memory[0],您会发现它包含的并不是它应该包含的内容。例如,当请求https://stackoverflow.com时,它包含0x4f44213c。这显然是错误的,因为这远远超出了有效代码点的范围。这实际上是前四个字符(<!DO)按小端(LE)顺序被塞进了同一个wchar_t。
它之所以看起来能工作,是因为还存在第二个bug。打印宽字符串时,需要使用%ls,而不是%s。
wprintf(L"Output:\n%s\n", MyOutputStruct1.memory);
应该是
printf("Output:\n%ls\n", MyOutputStruct1.memory);
// -or-
wprintf(L"Output:\n%ls\n", MyOutputStruct1.memory);
基本上,整段代码期望的都是char*。指针的类型是wchar_t*,但它在所有地方都被当作char*使用。因此,这两个bug在程序中基本上相互“抵消”了。(我没有验证过,但我预计当输入的长度不能被sizeof(wchar_t)整除时会出问题。)如果指针真的被当作wchar_t*使用(例如,检查它的各个元素,或者把它传递给某个w系列函数),那么问题就会很明显。
发布于 2022-01-11 01:05:19
正如评论部分所述,这里真正需要的是一个UTF-8解析器。char可以存放UTF-8数据,但如果不把它们转换为其他数据类型,我们就无法轻松地单独处理每个字符,因为有些UTF-8字符的长度大于1字节。所以我在libutf-8的帮助下写了一个解析器。
/* gcc unicode.c -o unicode -lutf-8
This program makes use of libutf-8.
http://www.whizkidtech.redprince.net/i18n/
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <locale.h>
#include <utf-8.h>
int* parse_UTF8_bitstream( size_t *len, const char *input_stream )
/* Decodes a NUL-terminated UTF-8 byte string into an array with one integer
 * per decoded character, so each character can be addressed individually.
 *
 * len          - receives the number of decoded characters, not counting the
 *                terminating 0; set to 0 on failure.
 * input_stream - NUL-terminated UTF-8 input.
 *
 * Returns a malloc()ed, 0-terminated array the caller must free(), or NULL
 * if memory could not be allocated or the decoder reported a length that is
 * not usable (see below).
 */
{
    size_t n_bytes = strlen( input_stream );
    size_t pos = 0;   /* Byte offset into input_stream. */
    size_t count = 0; /* Characters decoded so far. */

    /* Every decoded character consumes at least one byte, so n_bytes + 1
       slots always suffice; this avoids one realloc() per character. */
    unsigned int *output = malloc( (n_bytes + 1) * sizeof *output );

    *len = 0;
    if ( output == NULL ){
        return NULL;
    }

    while ( pos < n_bytes ){
        int n = 0;
        output[ count ] = sgetu8( (unsigned char *) &input_stream[ pos ], &n );

        /* NOTE(review): sgetu8() is assumed to store the number of bytes it
           consumed in n — confirm against libutf-8. A non-positive value
           would loop forever and a value past the terminator would read out
           of bounds, so both are rejected. */
        if ( n <= 0 || (size_t) n > n_bytes - pos ){
            free( output );
            return NULL;
        }
        pos += (size_t) n; /* Skip to the next UTF-8 sequence. */
        count++;
    }

    output[ count ] = 0; /* Terminate the decoded string. */
    *len = count;
    return (int*)output;
}
/* Prints a UTF-8 string, its length in bytes, the decoded string, its length
 * in decoded characters, and finally each decoded character on its own line.
 *
 * s - NUL-terminated UTF-8 string.
 */
void process_string(const char *s)
{
    printf("%s\n", s);
    printf("LENGTH: %zu 1-Byte Characters\n\n", strlen( s )); /* %zu matches size_t. */

    size_t len = 0;
    int* outputstream = parse_UTF8_bitstream( &len, s );
    if ( outputstream == NULL ){
        fprintf(stderr, "error: the string could not be decoded\n");
        return;
    }

    /* NOTE(review): %ls expects wchar_t*; this assumes wchar_t has the same
       size and representation as int on the target — confirm before porting. */
    printf("\n%ls\n", (const wchar_t *) outputstream);
    printf("LENGTH: %zu Wide Characters\n", len);
    for (size_t i = 0; i < len; i++){
        printf("%lc\n", outputstream[ i ]);
    }
    free ( outputstream );
}
int main(void)
/* Runs process_string() on "Hello World" written in Arabic, Russian, and
   Greek, after adopting the locale from the environment. */
{
    const char *const greetings[] = {
        "مرحبا بالعالم",
        "Всем привет",
        "Γεια σου κόσμε",
    };
    const size_t greeting_count = sizeof greetings / sizeof greetings[ 0 ];

    setlocale(LC_ALL, "");
    for (size_t k = 0; k < greeting_count; k++){
        process_string( greetings[ k ] );
    }
    return 0;
}
发布于 2022-01-11 18:00:47
这是我之前发布的程序,但是它不需要任何特殊的库。它使用stdlib中的mbtowc()函数。
来自mbtowc()手册页:
#include <stdlib.h>
int mbtowc(wchar_t * restrict wcharp, const char * restrict mbchar, size_t nbytes);
如果mbchar为NULL,则当支持移位状态时mbtowc()函数返回非零值,否则返回零。否则(即mbchar不是空指针),mbtowc()在mbchar表示空宽字符时返回0,或者返回mbchar中已处理的字节数;如果无法识别或转换多字节字符,则返回-1,在这种情况下,mbtowc()的内部转换状态是未定义的。
/* cc unicode.c -o unicode */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <locale.h>
int parse_UTF8_bitstream(wchar_t **output_stream, const char *input_stream )
/* Converts a NUL-terminated multibyte string (UTF-8 when the current locale
 * is a UTF-8 locale) into a wchar_t string using mbtowc(), so each character
 * can be addressed individually.
 *
 * output_stream - receives a malloc()ed, 0-terminated wchar_t string that
 *                 the caller must free(). It is set even when -1 is
 *                 returned (to the partially converted string, or to NULL if
 *                 the allocation itself failed), so free() is always safe.
 * input_stream  - NUL-terminated multibyte input.
 *
 * Returns the number of wide characters written, not counting the
 * terminator, or -1 if the input holds a sequence mbtowc() cannot convert
 * or memory could not be allocated.
 */
{
    size_t n_bytes = strlen( input_stream );
    size_t pos = 0; /* Byte offset into input_stream. */
    int len = 0;    /* Wide characters produced so far. */

    /* Each wide character consumes at least one input byte, so n_bytes + 1
       elements always suffice; this replaces one realloc() per character. */
    wchar_t *wide = malloc( (n_bytes + 1) * sizeof *wide );
    *output_stream = wide;
    if ( wide == NULL ){
        return -1;
    }

    /* Reset mbtowc()'s internal state; an earlier failed conversion leaves
       it undefined. */
    (void) mbtowc( NULL, NULL, 0 );

    while ( pos < n_bytes ){
        /* The third argument is the number of input bytes mbtowc() may
           inspect, so pass what is left of the string rather than
           sizeof(wchar_t), which is unrelated to the encoding. */
        int step = mbtowc( &wide[ len ], &input_stream[ pos ], n_bytes - pos );
        if ( step <= 0 ){
            wide[ len ] = 0; /* Leave a valid, terminated string behind. */
            return -1;
        }
        pos += (size_t) step; /* Skip to the next multibyte sequence. */
        len++;
    }

    wide[ len ] = 0; /* Terminate the wide string. */
    return len;
}
/* Prints a multibyte string, its length in bytes, the converted wide string,
 * its length in wide characters, and finally each wide character on its own
 * line. Exits with EXIT_FAILURE if the parser rejects the input.
 *
 * s - NUL-terminated multibyte (UTF-8) string.
 */
void process_string(const char *s)
{
    printf("\n%s\n", s);
    printf("LENGTH: %zu 1-Byte Characters\n\n", strlen( s )); /* %zu matches size_t. */

    wchar_t* outputstream = NULL;
    /* Keep the result as int: the parser signals failure with -1, which
       would silently become a huge value if stored in a size_t. */
    int len = parse_UTF8_bitstream( &outputstream, s );
    if( len < 0 ) {
        printf("\nThe parser received invalid unicode.\n");
        free ( outputstream );
        exit ( EXIT_FAILURE );
    }

    printf("%ls\n", outputstream);
    printf("LENGTH: %d 4-Byte Wide-Characters\n", len);
    for(int i = 0; i < len; i++){
        printf("%lc\n", outputstream[ i ]);
    }
    free ( outputstream );
}
int main ( void )
/* Feeds "Hello World" in Arabic, Russian, Greek, Georgian, Japanese,
   Chinese, and Korean through process_string(), followed by an emoji
   string kept as an illustration (each emoji appears to be larger than
   4 bytes; they are probably several unicode scalar values combined into
   one glyph).
   Asian and emoji characters need appropriate fonts to display. The
   noto-2.0 meta pkg on FreeBSD installs Asian and emoji fonts; it is also
   available on Linux.
*/
{
    const char *const samples[] = {
        "مرحبا بالعالم",
        "Всем привет",
        "Γεια σου κόσμε",
        "გამარჯობა სამყაროვ",
        "ハローワールド",
        "世界您好",
        "전 세계 여러분 안녕하세요",
        "️️️️",
    };
    const size_t sample_count = sizeof samples / sizeof samples[ 0 ];

    /* Adopt the environment's locale before any conversion takes place. */
    setlocale(LC_ALL, "");

    for (size_t k = 0; k < sample_count; k++){
        process_string( samples[ k ] );
    }
    return 0;
    /* Don't use this code to violate anyone. */
}
https://stackoverflow.com/questions/70655879
复制相似问题