我有一个C库,它从文件中读取二进制数据,对其进行转换,并将所有内容存储在一个大的 char * 缓冲区中,以便将数据返回给任何调用它的代码。这在C中运行得很好,但是使用python/Cython时,我在分配内存时遇到了问题。
该库的函数原型是:int readWrapper(struct options opt, char *lineOut);
我的pyx文件:
from libc.string cimport strcpy, memset
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf
# C declarations from reader.h that the wrapper below calls into.
cdef extern from "reader.h":
    # Settings struct handed to readWrapper by value.
    struct options:
        int debug
        char *filename

    # Extern variable `opt` of type `options` exposed by the header.
    options opt

    # Takes a caller-supplied output buffer `lineOut` and returns an int status.
    int readWrapper(options opt, char *lineOut)
def pyreader(file, date, debug=0):
    """Run the C reader on ``file`` and return its text output as a DataFrame.

    Parameters:
        file: Path of the input file (str, bytes or path-like).
        date: Unused by this version; kept so existing callers keep working.
        debug: Integer debug level copied into ``options.debug``.

    Returns:
        pandas.DataFrame parsed from the whitespace-delimited reader output.

    Raises:
        MemoryError: the output buffer could not be allocated.
        RuntimeError: ``readWrapper`` returned a non-zero status.

    NOTE: ``readWrapper`` is not told the buffer size, so output larger than
    ``out_size`` bytes still overflows the buffer. The real fix is to let the
    C side allocate (``char **lineOut``).
    """
    import os
    from io import StringIO

    import pandas as pd

    cdef options opts
    cdef size_t out_size = 50000
    cdef char *line_output = NULL
    cdef int return_val

    # Borrow the pointer from a bytes object rather than malloc'ing a buffer
    # that was never filled in; `file_bytes` must stay alive for the C call.
    file_bytes = os.fsencode(file)
    opts.filename = file_bytes
    opts.debug = debug

    # One extra byte so the terminating NUL always lies inside the allocation
    # (the old `line_output[outSize] = 0` wrote one byte past the end).
    line_output = <char *> malloc(out_size + 1)
    if line_output == NULL:
        raise MemoryError()
    try:
        # Zero-fill so the buffer is NUL-terminated wherever the C code stops.
        memset(line_output, 0, out_size + 1)

        return_val = readWrapper(opts, line_output)
        if return_val != 0:
            raise RuntimeError(
                "readWrapper failed with exit code {}".format(return_val))

        data = StringIO(line_output.decode('UTF-8', 'strict'))
        # sep=r"\s+" is the documented equivalent of the deprecated
        # delim_whitespace=True.
        return pd.read_csv(data, sep=r"\s+", header=None)
    finally:
        # Released on every exit path, including exceptions.
        free(line_output)
只要line_output保持在outSize
的大小内,它就能正常工作。但是有些文件更大,所以我如何动态地做到这一点?
根据DavidW的建议编辑
读取器包装类似于:
/*
 * Read the file named by opt.filename through reader() and hand back all of
 * its output as one NUL-terminated, heap-allocated string.
 *
 * On success *lineOut points at the buffer; the caller owns it and must
 * free() it. On failure *lineOut is NULL and nothing stays allocated.
 *
 * Returns EXIT_SUCCESS or EXIT_FAILURE.
 */
int readWrapper(struct options opt, char **lineOut)
{
    FILE *fp;
    char *outLine;
    char *grown;
    char line[255];
    size_t capacity = 5000;            /* bytes currently allocated */
    size_t numWritten = 0;             /* bytes stored, excluding the NUL */
    size_t num2Write;
    const size_t memIncrease = 10000;  /* growth step, larger than sizeof line */

    if (lineOut == NULL || opt.filename == NULL)
    {
        return (EXIT_FAILURE);
    }
    *lineOut = NULL;

    // Open file for reading
    fp = fopen(opt.filename, "r");
    // Check for valid fp
    if (fp == NULL)
    {
        printf("file pointer is null, aborting\n");
        return (EXIT_FAILURE);
    }

    // Allocate memory
    outLine = malloc(capacity);
    if (outLine == NULL)
    {
        fprintf(stderr, "Memory allocation failed!");
        fclose(fp);
        return (EXIT_FAILURE);
    }
    outLine[0] = '\0'; /* valid empty string even if nothing is read */

    /*
     * The old condition `fp != feof` compared the stream with the address of
     * the feof function and was therefore always true.
     * NOTE(review): reader() is defined elsewhere; it is assumed to advance
     * fp and to store a NUL-terminated string shorter than sizeof line.
     * If it reports end-of-data through a return value, loop on that instead.
     */
    while (!feof(fp) && !ferror(fp))
    {
        // Read part of file
        line[0] = '\0'; /* never re-append stale data if reader() writes nothing */
        reader(fp, opt, line);
        num2Write = strlen(line);

        if (capacity < (numWritten + num2Write + 1))
        { // Won't fit so enlarge outLine
            grown = realloc(outLine, capacity + memIncrease);
            if (grown == NULL)
            {
                fprintf(stderr, "Memory re-allocation failed!");
                /* realloc left the old block intact, so release it here */
                free(outLine);
                fclose(fp);
                return (EXIT_FAILURE);
            }
            outLine = grown;
            capacity += memIncrease;
        }

        /* Append on every iteration, not only when the buffer grew. */
        memcpy(outLine + numWritten, line, num2Write + 1);
        numWritten += num2Write;
    } // data block loop

    fclose(fp);
    *lineOut = outLine;
    return (EXIT_SUCCESS);
}
新的pyx文件:
from libc.string cimport strcpy, memset
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf
# C declarations from reader.h that the wrapper below calls into.
cdef extern from "reader.h":
    # Settings struct handed to readWrapper by value.
    struct options:
        int debug
        char *filename

    # Extern variable `opt` of type `options` exposed by the header.
    options opt

    # readWrapper now allocates the output itself and returns it through a
    # pointer-to-pointer, so the declaration must be `char **` to match both
    # the C definition and the `&line_output` call site.
    int readWrapper(options opt, char **lineOut)
def pyreader(file, date, debug=0):
    """Run the C reader on ``file`` and return its text output as a DataFrame.

    The output buffer is allocated by ``readWrapper`` and freed here.

    Parameters:
        file: Path of the input file (str, bytes or path-like).
        date: Unused by this version; kept so existing callers keep working.
        debug: Integer debug level copied into ``options.debug``.

    Returns:
        pandas.DataFrame parsed from the whitespace-delimited reader output.

    Raises:
        RuntimeError: ``readWrapper`` returned a non-zero status or produced
            no buffer.
    """
    import os
    from io import StringIO

    import pandas as pd

    cdef options opts
    cdef char *line_output = NULL
    cdef int return_val

    # The old code malloc'd a filename buffer but never copied the name into
    # it, so the C side opened an uninitialised path. Borrow the pointer from
    # a bytes object instead; `file_bytes` must stay alive for the C call.
    file_bytes = os.fsencode(file)
    opts.filename = file_bytes
    opts.debug = debug

    try:
        # Call reader
        return_val = readWrapper(opts, &line_output)
        if return_val != 0 or line_output == NULL:
            # Decoding a NULL pointer would crash the interpreter.
            raise RuntimeError(
                "readWrapper failed with exit code {}".format(return_val))

        # Create dataframe
        data = StringIO(line_output.decode('UTF-8', 'strict'))
        # sep=r"\s+" is the documented equivalent of the deprecated
        # delim_whitespace=True.
        return pd.read_csv(data, sep=r"\s+", header=None)
    finally:
        # free(NULL) is a no-op, so this is safe on the failure paths too.
        free(line_output)
这现在很好,但是在包装器(C)和python (pyx)部分中使用任何printf
或fprintf(stdout,...)
语句都会导致
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='utf-8'>
BrokenPipeError: [Errno 32] Broken pipe
当使用python3 test.py | head
时。不使用 head 时,不显示错误。
最后,关于文件名及其内存分配的建议对我也不适用。使用options.filename = file
虽然可以编译,但在运行时会抛出TypeError: expected bytes, str found
。有趣的是,BrokenPipeError只有在这样运行调用包装器的python代码时才会出现:python3 test.py | head
。不使用管道和 head 时,BrokenPipeError就不会出现。因此,这不是什么大问题,但我想了解是什么原因造成的。
在BrokenPipeError上搜索了一些之后进行编辑
这个BrokenPipeError问题在使用 head 时出现,而使用 tail 时不会出现。关于这个“错误”的解释可以在这里找到:https://stackoverflow.com/a/30091579/2885280
解决方案pyx文件:
最后一个使用前面提到的readWrapper.c的reader.pyx文件。内存分配由C处理,清理(最后)由pyx代码处理。
from libc.stdlib cimport free
# C declarations from reader.h used by pyreader.
cdef extern from "reader.h":
    # Settings struct handed to readWrapper by value.
    struct options:
        int debug
        char *filename
        char *DAY

    # Extern variable `opt` of type `options` exposed by the header.
    options opt

    # Returns an int status; the output buffer comes back through `lineOut`
    # and is released by the caller with free().
    int readWrapper(options opt, char **lineOut)
def pyreader(file, date, debug=0):
    """Run the C reader on ``file`` and return its output as a DataFrame.

    The output buffer is allocated by ``readWrapper`` and always freed here.
    Every failure is logged and reported by returning an empty DataFrame;
    no exception derived from ``Exception`` escapes.

    Parameters:
        file: Input file name; must be an ASCII-encodable str.
        date: Date string copied into ``options.DAY``; ASCII-encodable str.
        debug: Integer debug level; values > 0 enable DEBUG logging.

    Returns:
        pandas.DataFrame with the parsed data, or an empty one on failure.
    """
    import logging
    import sys
    from io import StringIO

    import pandas as pd

    cdef options opts
    cdef char *line_output = NULL

    # Init return value; handed back unchanged on every failure path.
    a = pd.DataFrame()

    # logging
    logging.basicConfig(stream=sys.stdout,
                        format='%(asctime)s:%(process)d:%(filename)s:%(lineno)s:pyreader: %(message)s',
                        datefmt='%Y%m%d_%H.%M.%S',
                        level=logging.DEBUG if debug > 0 else logging.INFO)
    try:
        # Check inputs
        if file is None:
            raise Exception("No valid filename provided")
        if date is None:
            raise Exception("No valid date provided")

        # The bytes objects own the memory the struct points at, so they must
        # stay alive until readWrapper has returned.
        file_enc = file.encode("ascii")
        opts.filename = file_enc
        day_enc = date.encode('ascii')
        opts.DAY = day_enc
        # Previously left uninitialised although the struct is passed by value.
        opts.debug = debug

        # Call reader
        return_val = readWrapper(opts, &line_output)
        if return_val > 0:
            logging.error("pyreadASTERIX2 failed with exitcode {}".format(return_val))
            return a
        if line_output == NULL:
            # Decoding a NULL pointer would crash the interpreter.
            logging.error("readWrapper returned no data for {}".format(file))
            return a

        try:
            # Decode once and reuse the text for both logging and parsing.
            text = line_output.decode('UTF-8', 'strict')
            logging.debug("return_val: {} and size: {}".format(return_val, len(text)))
            # sep=r"\s+" is the documented equivalent of the deprecated
            # delim_whitespace=True.
            a = pd.read_csv(StringIO(text), sep=r"\s+", header=None, dtype={'id': str})
        except Exception:
            logging.exception("Exception occured while loading: {} into DataFrame".format(file))
            return a

        if a.empty:
            logging.error("failed to load {} not enough data to construct DataFrame".format(file))
            return a
        logging.debug("converted data into pd")
        logging.debug("Size of df: {}".format(len(a)))
        # Success, return DataFrame
        return a
    except Exception:
        logging.exception("pyreader returned with an exception:")
        return a
    finally:
        # Single release point for the C buffer; free(NULL) is a no-op.
        free(line_output)
发布于 2020-08-20 09:12:13
你有两个基本的选择:
size = calculateSize(...)  # 例如,通过预读文件
line_output = malloc(size)
return_val = readWrapper(options, line_output)
readWrapper
负责分配内存。C中有两种常用的模式:返回指针(可能使用NULL
表示错误):
char* readWrapper(options opt)
传递指针到指针并修改它。
// C
int readWrapper(options opt, char **str_out) {
    // 计算长度
    *str_out = malloc(length);
    // etc
}

# Cython
cdef char *line_out
return_value = readWrapper(options, &line_out)
您需要确保您分配的所有字符串都已清理干净。您仍然有options.filename
的内存泄漏。对于options.filename
,您最好通过Cython获得一个指向file
内容的指针。这是有效的,只要file
存在,所以您不需要分配。
options.filename = file
只需确保options
的生命周期不超过file
(也就是说,它不会被存储在C中的任何地方供以后使用)。
总体而言
# Acquire the C allocation first, then guarantee its release on every exit path.
something = malloc(...)
try:
    # code that uses `something`
finally:
    # Runs whether the body completed normally or raised.
    free(something)
是确保清洁的好模式。
https://stackoverflow.com/questions/63489601
复制相似问题