
Parsing error when importing csv data with dask and pandas

Stack Overflow user
Asked on 2022-11-13 19:20:31
1 answer · 28 views · 0 followers · 0 votes

I am trying to import a very large .csv file like this:

import dask.dataframe as dd
import pandas as pd
#TO DO

dd_subf1_small = dd.read_csv('subf1_small.csv', dtype={'Unnamed: 0': 'float64','oecd_subfield':'object','paperid':'object'}, sep=None, engine = 'python').persist()

But I get the following error:

---------------------------------------------------------------------------
ParserError                               Traceback (most recent call last)
Cell In [1], line 5
      2 import pandas as pd
      3 #TO DO
----> 5 dd_subf1_small = dd.read_csv('subf1_small.csv', dtype={'Unnamed: 0': 'float64','oecd_subfield':'object','paperid':'object'}, sep=None, engine = 'python').persist()

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/dask/base.py:288, in DaskMethodsMixin.persist(self, **kwargs)
    249 def persist(self, **kwargs):
    250     """Persist this dask collection into memory
    251 
    252     This turns a lazy Dask collection into a Dask collection with the same
   (...)
    286     dask.base.persist
    287     """
--> 288     (result,) = persist(self, traverse=False, **kwargs)
    289     return result

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/dask/base.py:904, in persist(traverse, optimize_graph, scheduler, *args, **kwargs)
    901     keys.extend(a_keys)
    902     postpersists.append((rebuild, a_keys, state))
--> 904 results = schedule(dsk, keys, **kwargs)
    905 d = dict(zip(keys, results))
    906 results2 = [r({k: d[k] for k in ks}, *s) for r, ks, s in postpersists]

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/dask/threaded.py:89, in get(dsk, keys, cache, num_workers, pool, **kwargs)
     86     elif isinstance(pool, multiprocessing.pool.Pool):
     87         pool = MultiprocessingPoolExecutor(pool)
---> 89 results = get_async(
     90     pool.submit,
     91     pool._max_workers,
     92     dsk,
     93     keys,
     94     cache=cache,
     95     get_id=_thread_get_id,
     96     pack_exception=pack_exception,
     97     **kwargs,
     98 )
    100 # Cleanup pools associated to dead threads
    101 with pools_lock:

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/dask/local.py:511, in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
    509         _execute_task(task, data)  # Re-execute locally
    510     else:
--> 511         raise_exception(exc, tb)
    512 res, worker_id = loads(res_info)
    513 state["cache"][key] = res

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/dask/local.py:319, in reraise(exc, tb)
    317 if exc.__traceback__ is not tb:
    318     raise exc.with_traceback(tb)
--> 319 raise exc

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/dask/local.py:224, in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
    222 try:
    223     task, data = loads(task_info)
--> 224     result = _execute_task(task, data)
    225     id = get_id()
    226     result = dumps((result, id))

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/dask/core.py:119, in _execute_task(arg, cache, dsk)
    115     func, args = arg[0], arg[1:]
    116     # Note: Don't assign the subtask results to a variable. numpy detects
    117     # temporaries by their reference count and can execute certain
    118     # operations in-place.
--> 119     return func(*(_execute_task(a, cache) for a in args))
    120 elif not ishashable(arg):
    121     return arg

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/dask/optimization.py:990, in SubgraphCallable.__call__(self, *args)
    988 if not len(args) == len(self.inkeys):
    989     raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args)))
--> 990 return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/dask/core.py:149, in get(dsk, out, cache)
    147 for key in toposort(dsk):
    148     task = dsk[key]
--> 149     result = _execute_task(task, cache)
    150     cache[key] = result
    151 result = _execute_task(out, cache)

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/dask/core.py:119, in _execute_task(arg, cache, dsk)
    115     func, args = arg[0], arg[1:]
    116     # Note: Don't assign the subtask results to a variable. numpy detects
    117     # temporaries by their reference count and can execute certain
    118     # operations in-place.
--> 119     return func(*(_execute_task(a, cache) for a in args))
    120 elif not ishashable(arg):
    121     return arg

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/dask/dataframe/io/csv.py:129, in CSVFunctionWrapper.__call__(self, part)
    126         rest_kwargs["usecols"] = columns
    128 # Call `pandas_read_text`
--> 129 df = pandas_read_text(
    130     self.reader,
    131     block,
    132     self.header,
    133     rest_kwargs,
    134     self.dtypes,
    135     columns,
    136     write_header,
    137     self.enforce,
    138     path_info,
    139 )
    140 if project_after_read:
    141     return df[self.columns]

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/dask/dataframe/io/csv.py:182, in pandas_read_text(reader, b, header, kwargs, dtypes, columns, write_header, enforce, path)
    180 bio.write(b)
    181 bio.seek(0)
--> 182 df = reader(bio, **kwargs)
    183 if dtypes:
    184     coerce_dtypes(df, dtypes)

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/pandas/util/_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    305 if len(args) > num_allow_args:
    306     warnings.warn(
    307         msg.format(arguments=arguments),
    308         FutureWarning,
    309         stacklevel=stacklevel,
    310     )
--> 311 return func(*args, **kwargs)

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/pandas/io/parsers/readers.py:678, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    663 kwds_defaults = _refine_defaults_read(
    664     dialect,
    665     delimiter,
   (...)
    674     defaults={"delimiter": ","},
    675 )
    676 kwds.update(kwds_defaults)
--> 678 return _read(filepath_or_buffer, kwds)

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/pandas/io/parsers/readers.py:581, in _read(filepath_or_buffer, kwds)
    578     return parser
    580 with parser:
--> 581     return parser.read(nrows)

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1253, in TextFileReader.read(self, nrows)
   1251 nrows = validate_integer("nrows", nrows)
   1252 try:
-> 1253     index, columns, col_dict = self._engine.read(nrows)
   1254 except Exception:
   1255     self.close()

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/pandas/io/parsers/python_parser.py:270, in PythonParser.read(self, rows)
    267     indexnamerow = content[0]
    268     content = content[1:]
--> 270 alldata = self._rows_to_cols(content)
    271 data, columns = self._exclude_implicit_index(alldata)
    273 conv_data = self._convert_data(data)

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/pandas/io/parsers/python_parser.py:1013, in PythonParser._rows_to_cols(self, content)
   1007             reason = (
   1008                 "Error could possibly be due to quotes being "
   1009                 "ignored when a multi-char delimiter is used."
   1010             )
   1011             msg += ". " + reason
-> 1013         self._alert_malformed(msg, row_num + 1)
   1015 # see gh-13320
   1016 zipped_content = list(lib.to_object_array(content, min_width=col_len).T)

File ~/opt/anaconda3/envs/bocconi/lib/python3.8/site-packages/pandas/io/parsers/python_parser.py:739, in PythonParser._alert_malformed(self, msg, row_num)
    722 """
    723 Alert a user about a malformed row, depending on value of
    724 `self.on_bad_lines` enum.
   (...)
    736     even though we 0-index internally.
    737 """
    738 if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
--> 739     raise ParserError(msg)
    740 elif self.on_bad_lines == self.BadLineHandleMethod.WARN:
    741     base = f"Skipping line {row_num}: "

ParserError: Expected 3 fields in line 1811036, saw 5

Actually, I don't know how the data were generated, because the csv file is 36 GB and won't open. I saw another question where the error was caused by passing header=None, which I am not doing.

How can I avoid this error?

Thanks!


1 Answer

Stack Overflow user

Accepted answer

Posted on 2022-11-13 19:51:17

As the error says, the CSV file probably contains rows with 5 values instead of 3.
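
A minimal sketch (not part of the original answer) that reproduces the same ParserError on a tiny in-memory CSV, using the python engine as in the question:

import io
import pandas as pd

# A 3-column CSV whose third line has 5 fields instead of 3.
bad_csv = "a,b,c\n1,2,3\n1,2,3,4,5\n"

try:
    pd.read_csv(io.StringIO(bad_csv), engine="python")
except pd.errors.ParserError as e:
    print(e)  # Expected 3 fields in line 3, saw 5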

You have two options:

  1. Find those rows and fix/remove them from the file. This can be a challenge if the file is huge.
  2. Use the parameter on_bad_lines="skip" to make pandas skip them and continue loading the file (see the sketch just after this list).
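A minimal sketch of option 2 (not part of the original answer), keeping the question's file name, dtypes, and parser settings and only adding on_bad_lines. This assumes pandas >= 1.3, where on_bad_lines replaced the older error_bad_lines, and relies on dask.dataframe.read_csv forwarding extra keyword arguments to pandas.read_csv:

import dask.dataframe as dd

# Same file and dtypes as in the question; malformed rows are now
# dropped instead of raising ParserError.
dd_subf1_small = dd.read_csv(
    'subf1_small.csv',
    dtype={'Unnamed: 0': 'float64', 'oecd_subfield': 'object', 'paperid': 'object'},
    sep=None,
    engine='python',
    on_bad_lines='skip',
).persist()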

Read more about on_bad_lines here: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

Also, I noticed you are using sep=None. Why? Are the values in each row separated by "None"? That wouldn't make much sense. The default (and most common) separator, i.e. the delimiter, is a comma (,). Post an example of 3 lines from your file here so I can help further.
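To extract a few lines from a file too large to open in an editor, a standard-library sketch like the following can help (the file name is taken from the question; note that with dask the line number in the error may be relative to the block pandas was parsing, not to the whole file):

from itertools import islice

target = 1811036  # 1-based line number from the ParserError
with open('subf1_small.csv') as f:
    # Print the two lines before and after the offending one.
    for line in islice(f, target - 3, target + 2):
        print(line.rstrip())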

1 vote
Original page content provided by Stack Overflow; translation supported by Tencent Cloud.
Original link: https://stackoverflow.com/questions/74424037