我已经开了编程难题与代码高尔夫。这个问题还没有得到很好的理解,其中一个原因是没有提供参考执行。为了减轻这个缺点,我已经实现了一个参考解决方案(非高尔夫和文档)。
我想要你对
如果您不能检查所有内容,请集中注意IMHO可能需要注意的以下几点:
ParseHtmlToObjects(),这对我来说有点太长了AddProperty(),它有一个ref参数,并会产生副作用。如果可能的话,请建议一个更好的解决方案,最好是使用代码片段。我不想得到以下方面的反馈:
环境:
Convert()中存在“可能的多重枚举”。也许有人能详细说明这有多重要。任务:
有关更多细节,请参见PCG的问题。
守则的主要部分如下:
HtmlToJsonConverter()。Convert()法这里我不提供CustomJsonConverter,因为这很简单。
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Linq;
using ExCSS;
using HtmlAgilityPack;
using Newtonsoft.Json;
namespace QADirector
{
internal class HtmlToJsonConverter
{
private readonly FileInfo _inputFile;
private readonly FileInfo _outputFile;
/// <summary>
/// Creates an object that can convert a QA Director HTML export to a JSON object.
/// </summary>
/// <param name="source">HTML report to be converted.</param>
/// <param name="destination">JSON file name to be created</param>
public HtmlToJsonConverter(FileInfo source, FileInfo destination)
{
_outputFile = destination;
_inputFile = source;
}
/// <summary>
/// Converts the HTML input file into JSON and writes the output file
/// </summary>
public void Convert()
{
var htmlDocument = ReadFileToHtmlDocument();
var pages = GetPages(htmlDocument);
Debug.WriteLine("Found {0} pages", pages.Count());
var objects = ParseHtmlToObjects(pages);
WriteJson(objects);
}
/// <summary>
/// Write the object as JSON using a custom converter that
/// correctly writes the property list as single properties
/// and not as a list.
/// </summary>
/// <param name="obj">Object to be serialized</param>
private void WriteJson(DataObject obj)
{
var converter = new CustomJsonConverter();
var data = JsonConvert.SerializeObject(obj, Formatting.Indented, converter);
File.WriteAllText(_outputFile.FullName, data);
}
/// <summary>
/// Parses the HTML pages and creates an equivalent object by parsing the properties and values
/// as well as the properties and values of the sub items
/// </summary>
/// <param name="pages">HTML pages to be analyzed. <see cref="GetPages"/>.</param>
/// <returns>An object containing properties and child objects.</returns>
private static DataObject ParseHtmlToObjects(IEnumerable<HtmlNode> pages)
{
var rootObject = new DataObject();
var currentObject = rootObject;
// Go through all pages
foreach (var page in pages)
{
// Analyze all the span tags
var steps = page.Descendants().Where(x => (x.Name == "span")).ToList();
string key = null;
foreach (var step in steps)
{
if (!IsKey(step))
{
// If this is not a key, the key was detected before. Use it to populate the object
currentObject = AddProperty(currentObject, key, GetTextFromSpan(step));
key = null;
}
else
{
if (key != null)
{
// Special case: we detected a new key, although the old key has not been used as property yet
// This can happen for keys without value, so add it empty.
currentObject = AddProperty(currentObject, key, "");
}
key = GetKeyFromNode(step);
var level = GetIndentationFromNode(step);
if (level == currentObject.level)
{
// Still the same level.
// Remain at the same object in order to add more properties.
}
else if (level > currentObject.level)
{
// Decend to lower level: create a new child
var child = new DataObject {level = level, Parent = currentObject};
currentObject.Children.Add(child);
currentObject = child;
}
else
{
// Move up until level is equal
while (level < currentObject.level)
{
currentObject = currentObject.Parent;
}
}
}
}
}
return rootObject;
}
/// <summary>
/// Adds a property to an object.
/// If that property already exists, create a new object and add the property there
/// </summary>
/// <param name="obj">Object to add the property to</param>
/// <param name="key">Name of the property</param>
/// <param name="value">Value of the property</param>
/// <returns>The same object if the property did not exist yet, the new object if the property already existed</returns>
private static DataObject AddProperty(DataObject obj, string key, string value)
{
// Special case: <Span> which contains the page information. Skip it.
if (key == null) return obj;
if (obj.Properties.ContainsKey(key))
{
// This key was already assigned, so this must be a new object on the same level
var sibling = new DataObject {level = obj.level, Parent = obj.Parent};
obj.Parent.Children.Add(sibling);
obj = sibling;
}
obj.Properties.Add(key, value);
return obj;
}
/// <summary>
/// Gets the indentation from the HTML node.
/// Detection is done via the left CSS style of the HTML element.
/// </summary>
/// <param name="step">HTML node to analyze</param>
/// <returns>Indentation value, 0 if no left attribute was found</returns>
private static decimal GetIndentationFromNode(HtmlNode step)
{
decimal thisIndent=0;
var styleSheet = ExtractStyle(step);
foreach (var cssAttribute in styleSheet.StyleRules[0].Declarations)
{
if (cssAttribute.Name != "left") continue;
var numberFormatInfo = new NumberFormatInfo {NumberDecimalSeparator = "."};
thisIndent = decimal.Parse(cssAttribute.Term.ToString().Replace("in", ""), numberFormatInfo);
Debug.WriteLine("Left attribute found:" + cssAttribute.Term);
}
return thisIndent;
}
/// <summary>
/// Check if the node contains a key and if so, return its text.
/// </summary>
/// <param name="span">HTML SPAN node to be analyzed</param>
/// <returns>Text of the key if key was detected, <c>null</c> otherwise.
/// Text is cleaned from HTML entities and has the trailing colon removed.</returns>
private static string GetKeyFromNode(HtmlNode span)
{
return IsKey(span) ? GetTextFromSpan(span).Trim(':') : null;
}
/// <summary>
/// Check if the node contains a key.
/// Detection is done via the <c>font-weight="bold"</c> CSS style of the HTML element.
/// </summary>
/// <param name="span">HTML SPAN node to be analyzed</param>
/// <returns><c>True</c> if font-weight was bold, <c>false</c> otherwise.</returns>
private static bool IsKey(HtmlNode span)
{
var styleSheet = ExtractStyle(span);
foreach (var cssAttribute in styleSheet.StyleRules[0].Declarations)
{
if (cssAttribute.Name != "font-weight" || cssAttribute.Term.ToString() != "bold") continue;
return true;
}
return false;
}
/// <summary>
/// Gets the text from a HTML SPAN element.
/// This includes removal of the <c>NOBR</c> tag and decoding of HTML special characters.
/// </summary>
/// <param name="span">SPAN element to get the text from.</param>
/// <returns>Inner text of the SPAN node</returns>
private static string GetTextFromSpan(HtmlNode span)
{
var text = span.Descendants("nobr").First().InnerText; // There is only one <nobr>
text = System.Net.WebUtility.HtmlDecode(text);
return text;
}
/// <summary>
/// Extracts the information of the <c>style</c> attribute
/// </summary>
/// <param name="element">HTML element to extract the style information from</param>
/// <returns></returns>
private static StyleSheet ExtractStyle(HtmlNode element)
{
var rawStyle = element.Attributes["style"].Value;
var styleSheet = new Parser().Parse(String.Format(".dummy{{{0}}}", rawStyle));
return styleSheet;
}
/// <summary>
/// Gets the pages, identified by a DIV element defining the width and height
/// inside another DIV element
/// (the one with style="page-break-inside:avoid;page-break-after:always;")
/// </summary>
/// <param name="htmlDocument">HTML to analyze</param>
/// <returns>DIV nodes corresponding to pages, using the inner of the described DIVs (the one defining width and height)</returns>
private static IEnumerable<HtmlNode> GetPages(HtmlDocument htmlDocument)
{
var pages =
htmlDocument.DocumentNode.Descendants()
.Where(x =>(x.Name == "div" && x.Ancestors("div").Count() == 1));
return pages;
}
/// <summary>
/// Reads the file contents from disk and converts it into a HTML document
/// </summary>
/// <returns>HTML document as read from disk</returns>
private HtmlDocument ReadFileToHtmlDocument()
{
var source = File.ReadAllText(_inputFile.FullName);
var html = new HtmlDocument();
html.LoadHtml(source);
return html;
}
}
}发布于 2015-03-04 23:08:24
这是令人困惑的:
foreach ( styleSheet.StyleRules0.Declarations) { if (cssAttribute.Name !=“cssAttribute”))继续;返回true;}返回false;
我觉得这更清楚
foreach (var cssAttribute in styleSheet.StyleRules[0].Declarations)
{
if (cssAttribute.Name == "font-weight" && cssAttribute.Term.ToString() == "bold")
{
return true;
}
}
return false;我建议使用Any,但正如您提到的,您更喜欢循环。
https://codereview.stackexchange.com/questions/83233
复制相似问题