通过前面对复合文档结构的分析,解析起来问题就不大了。
01
解析Header结构
Header结构一定位于文件的最开始处,所以直接按顺序逐个字段地读取就行了:
'In-memory copy of the compound-file header.
'Fields are declared in the same order in which parseCfHeader reads them from the file.
Private Type CFHeader
Signature(7) As Byte '8-byte file signature (document identifier)
CLSID(15) As Byte '16-byte class identifier of the file
MinorVersion As Integer 'Minor (revision) version of the file format
MajorVersion As Integer 'Major version of the file format
ByteOrder As Integer 'Byte-order mark; FFFE means little-endian
SectorShift As Integer 'Sector size as a power of two, typically 2^9 = 512
MiniSectorShift As Integer 'Mini (short) sector size as a power of two, typically 2^6 = 64
Reserved(5) As Byte '6 reserved bytes
DirSectorsCount As Long 'Number of directory sectors
FATSectorsCount As Long 'Total number of FAT sectors
FirstDirSID As Long 'Sector ID of the first sector of the directory stream
TransactionSignatureNumber As Long 'Transaction signature number
MiniStreamSize As Long 'Mini stream cutoff size per MS-CFB (NOTE(review): original comment called this the "minimum standard stream" - confirm)
FirstMiniFATSID As Long 'Sector ID of the first mini FAT sector
MiniFATSectorsCount As Long 'Total number of mini FAT sectors
FirstDIFATSID As Long 'Sector ID of the first DIFAT sector
DIFATSectorsCount As Long 'Total number of DIFAT sectors
DIFATS(108) As Long 'First 109 DIFAT entries, stored in the header itself
End Type
'Read and validate the compound-file header from the current reader position.
'
'Fills cf.Header field by field, then derives cf.lSectorSize and
'cf.longNumPerSector from the validated sector shift.
'
'Returns an empty string on success, otherwise a message describing the
'first problem found (short read, bad signature, unsupported byte order,
'sector sizes or major version).
Private Function parseCfHeader() As String
    Dim i As Long

    'Signature: exactly 8 bytes must be available.
    If cf.r.Read(cf.Header.Signature) <> 8 Then
        parseCfHeader = "复合文档:文件头id读取出错"
        Exit Function
    End If

    'Expected signature bytes: D0 CF 11 E0 A1 B1 1A E1.
    Dim magic()
    magic = Array(208, 207, 17, 224, 161, 177, 26, 225)
    For i = 0 To 8 - 1
        If cf.Header.Signature(i) <> magic(i) Then
            parseCfHeader = "复合文档:文件头id出错"
            Exit Function
        End If
    Next

    'CLSID: Read reports the number of bytes it delivered, so a value other
    'than 16 means the file ends inside the header.
    If cf.r.Read(cf.Header.CLSID) <> 16 Then
        parseCfHeader = "复合文档:文件头读取出错"
        Exit Function
    End If

    cf.Header.MinorVersion = cf.r.ReadInteger     'format revision number
    cf.Header.MajorVersion = cf.r.ReadInteger     'format version number
    cf.Header.ByteOrder = cf.r.ReadInteger        'FFFE means little-endian
    cf.Header.SectorShift = cf.r.ReadInteger      'sector size, power of two
    cf.Header.MiniSectorShift = cf.r.ReadInteger  'mini sector size, power of two

    'Reserved: 6 bytes, same short-read check as above.
    If cf.r.Read(cf.Header.Reserved) <> 6 Then
        parseCfHeader = "复合文档:文件头读取出错"
        Exit Function
    End If

    cf.Header.DirSectorsCount = cf.r.ReadLong()
    cf.Header.FATSectorsCount = cf.r.ReadLong             'number of FAT sectors
    cf.Header.FirstDirSID = cf.r.ReadLong                 'first directory sector ID
    cf.Header.TransactionSignatureNumber = cf.r.ReadLong()
    cf.Header.MiniStreamSize = cf.r.ReadLong
    cf.Header.FirstMiniFATSID = cf.r.ReadLong             'first mini FAT sector ID
    cf.Header.MiniFATSectorsCount = cf.r.ReadLong         'number of mini FAT sectors
    cf.Header.FirstDIFATSID = cf.r.ReadLong               'first DIFAT sector ID
    cf.Header.DIFATSectorsCount = cf.r.ReadLong           'number of DIFAT sectors

    'The first 109 DIFAT entries live in the header itself.
    For i = 0 To 109 - 1
        cf.Header.DIFATS(i) = cf.r.ReadLong
    Next

    '&HFFFE stored in a 16-bit signed Integer is -2.
    If cf.Header.ByteOrder <> -2 Then
        parseCfHeader = "复合文档:memory endian 不是小端"
        Exit Function
    End If

    If cf.Header.MiniSectorShift <> 6 Then
        parseCfHeader = "复合文档:The sector size of the Mini Stream MUST be 64 bytes"
        Exit Function
    End If

    'The sector shift is fixed by the major version.
    Select Case cf.Header.MajorVersion
        Case 3
            If cf.Header.SectorShift <> &H9 Then
                parseCfHeader = "复合文档:If Major Version is 3, the Sector Shift MUST be 0x0009, specifying a sector size of 512 bytes."
                Exit Function
            End If
        Case 4
            If cf.Header.SectorShift <> &HC Then
                parseCfHeader = "复合文档:If Major Version is 4, the Sector Shift MUST be 0x000C, specifying a sector size of 4,096 bytes."
                Exit Function
            End If
        Case Else
            parseCfHeader = "复合文档:Major Version must be 3 or 4."
            Exit Function
    End Select

    'Derived values used by the DIFAT/FAT parsers.
    cf.lSectorSize = 2 ^ cf.Header.SectorShift
    cf.longNumPerSector = cf.lSectorSize \ 4   'number of 4-byte entries per sector
End Function
解析DIFAT数组
'Build cf.DIFAT, the list of sector IDs that hold the FAT.
'
'cf.DIFAT(n) is the sector ID of the n-th FAT sector. The first 109 entries
'come from the header (cf.Header.DIFATS); any further entries are read from
'the chain of DIFAT sectors that starts at cf.Header.FirstDIFATSID. In each
'DIFAT sector all entries but the last are FAT sector IDs, and the last
'entry is the sector ID of the next DIFAT sector.
'
'Returns an empty string on success, or an error message when the header
'declares no FAT sectors at all.
Private Function parseDIFAT() As String
    Const HEADER_DIFAT_COUNT As Long = 109

    Dim total As Long
    Dim count As Long
    Dim i As Long
    Dim tmp As Long
    Dim next_SID As Long

    total = cf.Header.FATSectorsCount
    If total <= 0 Then
        'ReDim with an upper bound of -1 would raise a run-time error.
        parseDIFAT = "复合文档:FAT扇区数量出错"
        Exit Function
    End If
    ReDim cf.DIFAT(total - 1) As Long

    'Entries stored in the header. Stop at the first free slot or as soon
    'as every declared FAT sector has been collected.
    'Free_SID is assumed to be the free-sector marker (-1) that the
    'original code compared against here - confirm against its definition.
    count = 0
    For i = 0 To HEADER_DIFAT_COUNT - 1
        If count >= total Then Exit Function
        If cf.Header.DIFATS(i) = Free_SID Then Exit Function
        cf.DIFAT(count) = cf.Header.DIFATS(i)
        count = count + 1
    Next i

    'Remaining entries live in the DIFAT sector chain. Special markers such
    'as end-of-chain (-2) and free (-1) are negative when read as a signed
    'Long, so a negative next_SID ends the walk instead of being used as a
    'sector to seek to.
    next_SID = cf.Header.FirstDIFATSID
    Do While count < total And next_SID >= 0
        'Position the reader at the start of this DIFAT sector.
        cf.r.SeekFile getOffsetBySID(next_SID), OriginF

        'All entries except the last one are FAT sector IDs.
        For i = 0 To cf.longNumPerSector - 1 - 1
            tmp = cf.r.ReadLong()
            If tmp = Free_SID Then Exit Function
            cf.DIFAT(count) = tmp
            count = count + 1
            'Never write past the array sized from the header.
            If count >= total Then Exit Function
        Next i

        'The last 4 bytes of the sector hold the next DIFAT sector ID.
        next_SID = cf.r.ReadLong()
    Loop
End Function
03
解析FAT数组
通过DIFAT来解析FAT数组,FAT数组是构建扇区链表的重要数据信息:
'Load the sector allocation table (FAT) into cf.FAT.
'
'The FAT is an array indexed by sector ID; each value is the ID of the next
'sector in the same chain. The FAT sectors themselves are located through
'cf.DIFAT, so parseDIFAT must have run first.
'
'The function never assigns its return value, so it always yields an empty
'string.
Private Function parseFAT() As String
    Dim sectorIdx As Long, entryIdx As Long
    Dim perSector As Long
    Dim fatSectors As Long
    Dim writePos As Long

    perSector = cf.longNumPerSector          'number of 4-byte entries in one sector
    fatSectors = cf.Header.FATSectorsCount

    'One slot for every entry of every FAT sector.
    ReDim cf.FAT(fatSectors * perSector - 1) As Long

    writePos = 0
    sectorIdx = 0
    Do While sectorIdx < fatSectors
        'Jump to the start of the FAT sector named by the DIFAT.
        cf.r.SeekFile getOffsetBySID(cf.DIFAT(sectorIdx)), OriginF

        'Copy the whole sector, one Long at a time.
        For entryIdx = 1 To perSector
            cf.FAT(writePos) = cf.r.ReadLong()
            writePos = writePos + 1
        Next entryIdx

        sectorIdx = sectorIdx + 1
    Loop
End Function
FAT数组解析出来之后,就可以去解析目录信息和MiniSAT了,因为目录信息和MiniSAT的数据是很有可能需要多个扇区储存的,必然需要扇区链接的信息,所以FAT数组必须要先进行解析。