更新:我使用@Dmitry的答案作为指南将代码重构到一个要旨中。更新要简单得多,实现了IDisposable
,并且缩短了大约30行。
我周末写这篇文章是为了好玩,我正在寻找批评。欢迎评论风格和可读性,但我真正需要知道的是:
当我问自己这些问题时,我得到1=是,2=不,3= maaaaaybe。我想添加其他特性,比如跳过标题行、推断数据类型、验证字段计数等等,但是我将通过派生或扩展来解决这类问题,因为如果基于这样一个现有的IEnumerable<IEnumerable<>>
,这种逻辑的实现会更简单。
FLAME ON;
foreach (var row in DelimitedReader.Create(fileName)) {
foreach (var field in row) {
// do stuff
}
}
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace ByteTerrace
{
public class DelimitedReader : IEnumerable<IEnumerable<string>>
{
private const int DEFAULT_CHUNK_SIZE = 128;
private const char DEFAULT_ESCAPE_CHAR = '"';
private const char DEFAULT_SEPARATOR_CHAR = ',';
private readonly char[] m_buffer;
private readonly Encoding m_encoding;
private readonly char m_escapeChar;
private readonly string m_fileName;
private readonly char m_separatorChar;
public char[] Buffer {
get {
return m_buffer;
}
}
public Encoding Encoding {
get {
return m_encoding;
}
}
public char EscapeChar {
get {
return m_escapeChar;
}
}
public string FileName {
get {
return m_fileName;
}
}
public char SeparatorChar {
get {
return m_separatorChar;
}
}
public DelimitedReader(string fileName, char separatorChar = DEFAULT_SEPARATOR_CHAR, char escapeChar = DEFAULT_ESCAPE_CHAR, Encoding encoding = null, int bufferSize = DEFAULT_CHUNK_SIZE) {
m_buffer = new char[bufferSize];
m_encoding = (encoding ?? Encoding.UTF8);
m_escapeChar = escapeChar;
m_fileName = fileName;
m_separatorChar = separatorChar;
}
public IEnumerator<IEnumerable<string>> GetEnumerator() {
return ReadFields().GetEnumerator();
}
IEnumerator IEnumerable.GetEnumerator() {
return GetEnumerator();
}
IEnumerable<IEnumerable<string>> ReadFields() {
return ReadFields(ReadAllChunks(FileName, Encoding, Buffer), SeparatorChar, EscapeChar);
}
public static DelimitedReader Create(string fileName, char separatorChar = DEFAULT_SEPARATOR_CHAR, char escapeChar = DEFAULT_ESCAPE_CHAR, Encoding encoding = null, int bufferSize = DEFAULT_CHUNK_SIZE) {
return new DelimitedReader(fileName, separatorChar, escapeChar, encoding, bufferSize);
}
public static IEnumerable<char[]> ReadAllChunks(TextReader reader, char[] buffer) {
var count = buffer.Length;
var numBytesRead = 0;
while ((numBytesRead = reader.ReadBlock(buffer, 0, count)) == count) {
yield return buffer;
}
if (numBytesRead > 0) {
Array.Resize(ref buffer, numBytesRead);
yield return buffer;
}
}
public static IEnumerable<char[]> ReadAllChunks(string fileName, Encoding encoding, char[] buffer) {
return ReadAllChunks(new StreamReader(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan), encoding), buffer);
}
public static string ReadField(StringBuilder buffer, int offset, int position, char escapeChar) {
if (buffer[offset] == escapeChar) {
if (position - offset != 2) {
return buffer.ToString(offset + 1, position - offset - 3);
}
else {
return string.Empty;
}
}
else {
return buffer.ToString(offset, position - offset - 1);
}
}
public static IEnumerable<IEnumerable<string>> ReadFields(IEnumerable<char[]> chunks, char separatorChar = DEFAULT_SEPARATOR_CHAR, char escapeChar = DEFAULT_ESCAPE_CHAR) {
var buffer = new StringBuilder();
var fields = new List<string>();
var endOfBuffer = 0;
var escaping = false;
var offset = 0;
var position = 0;
var head0 = '\0';
var head1 = head0;
foreach (var chunk in chunks) {
buffer.Append(chunk, 0, chunk.Length);
endOfBuffer = buffer.Length;
while (position < endOfBuffer) {
head1 = head0;
if ((head0 = buffer[position++]) == escapeChar) {
escaping = !escaping;
if ((head0 == escapeChar) && (head1 == escapeChar)) {
endOfBuffer--;
position--;
buffer.Remove(position, 1);
}
}
if (!escaping) {
if ((head0 == '\n') || (head0 == '\r')) {
if ((head1 != '\r') || (head0 == '\r')) {
fields.Add(ReadField(buffer, offset, position, escapeChar));
yield return fields;
buffer.Remove(0, position);
endOfBuffer = buffer.Length;
fields.Clear();
offset = 0;
position = 0;
}
else {
offset++;
}
}
else if (head0 == separatorChar) {
fields.Add(ReadField(buffer, offset, position, escapeChar));
offset = position;
}
}
}
}
if (buffer.Length > 0) {
fields.Add(buffer.ToString());
}
if (fields.Count > 0) {
yield return fields;
}
}
}
}
发布于 2016-11-01 21:01:51
我希望尽可能多地依赖内置功能。我想相信,使用内置的东西可以使我的代码更易读,而且可能更快。
所以我的建议是:
public class DelimitedReader : IEnumerable<string[]>, IDisposable
{
private readonly StreamReader reader;
public DelimitedReader(string fileName, Encoding encoding = null)
: this(new StreamReader(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite),
encoding ?? Encoding.UTF8, encoding == null))
{
}
public DelimitedReader(StreamReader reader)
{
this.reader = reader;
}
public void Dispose()
{
reader.Dispose();
}
public char EscapeChar { get; set; } = '"';
public char SeparatorChar { get; set; } = ',';
private string[] ParseLine(string line)
{
List<string> fields = new List<string>();
char[] charsToSeek = { EscapeChar, SeparatorChar };
bool isEscaped = false;
int prevPos = 0;
while (prevPos < line.Length)
{
// If in the escaped mode, seek for the escape char only.
// Otherwise, seek for the both chars.
int nextPos = isEscaped
? line.IndexOf(EscapeChar, prevPos)
: line.IndexOfAny(charsToSeek, prevPos);
if (nextPos == -1)
{
// We reached the end of the line
if (!isEscaped)
{
// Add the rest of the line
fields.Add(line.Substring(prevPos, line.Length - prevPos).Trim());
break;
}
// If there is no closing escape char
throw new InvalidDataException("The following line has invalid format: " + line);
}
char nextChar = line[nextPos];
if (nextChar == EscapeChar)
{
// The next char is the escape char
if (isEscaped)
{
// If already in the escaped mode
fields.Add(line.Substring(prevPos, nextPos - prevPos)); // No Trim
}
isEscaped = !isEscaped; // Toggle mode
}
else
{
// The next char is the delimiter
fields.Add(line.Substring(prevPos, nextPos - prevPos).Trim()); // Trim
}
prevPos = nextPos + 1;
}
return fields.ToArray();
}
public IEnumerator<string[]> GetEnumerator()
{
while (!reader.EndOfStream)
{
yield return ParseLine(reader.ReadLine());
}
}
IEnumerator IEnumerable.GetEnumerator()
{
return GetEnumerator();
}
}
在上面的类中,我使用StreamReader.ReadLine
方法逐行读取文件,使用String.IndexOf
/String.IndexOfAny
方法在行中移动。
根据我的测试结果,这种方法更快一些。
发布于 2016-11-01 19:26:49
公共DelimitedReader(string fileName,char separatorChar = DEFAULT_SEPARATOR_CHAR,char escapeChar = DEFAULT_ESCAPE_CHAR,Encoding = null,int bufferSize = DEFAULT_CHUNK_SIZE)
这个构造函数非常大。如果您有那么多可选参数,那么将它们转换为属性。对于用户来说,更清楚的是,要创建一个有效的对象,他只需要指定一个参数。
Sill最好是一个全新的类型,将所有可选参数作为DelimitedReadProperties
保存。
您已经计划扩展这个类,所以将配置转移到一个专门的类中可能是一个很好的决定。
返回ReadAllChunks(新FileStream(fileName,FileMode.Open,FileAccess.Read,FileShare.Read,4096,FileOptions.SequentialScan),编码),缓冲区;
这条线的主要功能是可怕的。一切都嵌套在一个漫长的呼叫链中。
流应该被处理掉。
该类中的每个API都是静态的。你为什么要创建一个读者的实例呢?通过直接调用这些方法,可以在相同的结果和复杂度下使用。
https://codereview.stackexchange.com/questions/145860
复制相似问题