首页
学习
活动
专区
圈层
工具
发布

VBA解析复合文档03——解析扇区链表

通过前面对复合文档结构的分析,解析起来问题就不大了。

01

解析Header结构

Header结构一定是在文件的最开始处,所以直接一个一个字段地读取就行了:

代码语言:vba
复制
' In-memory copy of the compound-file header. parseCfHeader fills it
' one field at a time, in the order the fields are declared here.
Private Type CFHeader
    Signature(7) As Byte               'Document signature (8 bytes, indices 0-7)
    CLSID(15) As Byte                  'Unique identifier of the file (16 bytes)
    MinorVersion As Integer            'File-format revision number
    MajorVersion As Integer            'File-format version number
    ByteOrder As Integer               'FFFE means little-endian
    SectorShift As Integer             'Sector size as a power of two, usually 2^9 = 512
    MiniSectorShift As Integer         'Mini (short) sector size as a power of two, usually 2^6
    Reserved(5) As Byte
    DirSectorsCount As Long             'Number of directories (original note) - NOTE(review): the name suggests a count of directory sectors; confirm
    FATSectorsCount As Long             'Total number of FAT (allocation table) sectors
    FirstDirSID As Long                 'ID of the first sector of the directory stream
    TransactionSignatureNumber  As Long 'No description in the original
    MiniStreamSize As Long              'Original note: "minimum standard stream" - NOTE(review): presumably the size threshold between mini and standard streams; confirm
    FirstMiniFATSID As Long             'ID of the first sector of the mini FAT
    MiniFATSectorsCount As Long         'Total number of mini FAT sectors
    FirstDIFATSID As Long               'ID of the first sector of the DIFAT (master allocation table)
    DIFATSectorsCount As Long           'Total number of allocation-table sectors (original note) - NOTE(review): by name this counts DIFAT sectors; confirm
    DIFATS(108) As Long                 'First 109 DIFAT entries (indices 0-108)
End Type

' Reads the compound-file header into cf.Header and validates it.
'
' The function does not seek; it assumes the reader cf.r is already positioned
' at the very start of the file, where the header lives.
'
' Returns: an empty string on success, otherwise a message describing the
'          first problem that was detected.
' Side effects: fills cf.Header and, on success, sets cf.lSectorSize and
'               cf.longNumPerSector.
Private Function parseCfHeader() As String
    Dim iret As Long

    ' cf.r.Read returns the number of bytes it delivered into the array.
    iret = cf.r.Read(cf.Header.Signature)
    If iret <> 8 Then
        parseCfHeader = "复合文档:文件头id读取出错"
        Exit Function
    End If

    ' Expected signature bytes: D0 CF 11 E0 A1 B1 1A E1.
    Dim arr()
    arr = Array(208, 207, 17, 224, 161, 177, 26, 225)
    Dim i As Long
    For i = 0 To 8 - 1
        If cf.Header.Signature(i) <> arr(i) Then
            parseCfHeader = "复合文档:文件头id出错"
            Exit Function
        End If
    Next

    ' Unique identifier of the file. A short read means the file is truncated,
    ' so it is reported the same way as a short signature read.
    iret = cf.r.Read(cf.Header.CLSID)
    If iret <> 16 Then
        parseCfHeader = "复合文档:文件头CLSID读取出错"
        Exit Function
    End If

    cf.Header.MinorVersion = cf.r.ReadInteger       ' file-format revision number
    cf.Header.MajorVersion = cf.r.ReadInteger       ' file-format version number
    cf.Header.ByteOrder = cf.r.ReadInteger          ' FFFE means little-endian
    cf.Header.SectorShift = cf.r.ReadInteger        ' sector size as a power of two
    cf.Header.MiniSectorShift = cf.r.ReadInteger    ' mini sector size as a power of two

    iret = cf.r.Read(cf.Header.Reserved)
    If iret <> 6 Then
        parseCfHeader = "复合文档:文件头保留字段读取出错"
        Exit Function
    End If

    cf.Header.DirSectorsCount = cf.r.ReadLong()
    cf.Header.FATSectorsCount = cf.r.ReadLong       ' total number of FAT sectors
    cf.Header.FirstDirSID = cf.r.ReadLong           ' first sector of the directory stream
    cf.Header.TransactionSignatureNumber = cf.r.ReadLong()
    cf.Header.MiniStreamSize = cf.r.ReadLong
    cf.Header.FirstMiniFATSID = cf.r.ReadLong       ' first sector of the mini FAT
    cf.Header.MiniFATSectorsCount = cf.r.ReadLong   ' total number of mini FAT sectors
    cf.Header.FirstDIFATSID = cf.r.ReadLong         ' first sector of the DIFAT
    cf.Header.DIFATSectorsCount = cf.r.ReadLong

    ' The first 109 DIFAT entries are stored inside the header itself.
    For i = 0 To 109 - 1
        cf.Header.DIFATS(i) = cf.r.ReadLong
    Next

    ' &HFFFE held in a 16-bit signed Integer is -2.
    If cf.Header.ByteOrder <> -2 Then
        parseCfHeader = "复合文档:memory endian 不是小端"
        Exit Function
    End If
    If cf.Header.MiniSectorShift <> 6 Then
        parseCfHeader = "复合文档:The sector size of the Mini Stream MUST be 64 bytes"
        Exit Function
    End If

    ' The sector shift is fixed by the major version: 9 for v3, 12 for v4.
    If cf.Header.MajorVersion = 3 Then
        If cf.Header.SectorShift <> &H9 Then
            parseCfHeader = "复合文档:If Major Version is 3, the Sector Shift MUST be 0x0009, specifying a sector size of 512 bytes."
            Exit Function
        End If

    ElseIf cf.Header.MajorVersion = 4 Then
        If cf.Header.SectorShift <> &HC Then
            parseCfHeader = "复合文档:If Major Version is 4, the Sector Shift MUST be 0x000C, specifying a sector size of 4,096 bytes."
            Exit Function
        End If

    Else
        parseCfHeader = "复合文档:Major Version must be 3 or 4."
        Exit Function
    End If

    ' Derived values: sector size in bytes and the number of 4-byte Longs per sector.
    cf.lSectorSize = 2 ^ cf.Header.SectorShift
    cf.longNumPerSector = cf.lSectorSize \ 4
End Function

02

解析DIFAT数组

代码语言:vba
复制
' Reads the master sector allocation table (DIFAT, the main allocator of
' space within the compound file) into cf.DIFAT.
' cf.DIFAT is an array of sector IDs: the value n means that the FAT occupies
' sector n.
'
' The first 109 entries come from the header (cf.Header.DIFATS). Any further
' entries are read from the chain of DIFAT sectors that starts at
' cf.Header.FirstDIFATSID.
'
' Returns: an empty string on success, otherwise an error message.
' Side effects: re-dimensions and fills cf.DIFAT; moves the read position of cf.r.
Private Function parseDIFAT() As String
    Const HEADER_DIFAT_ENTRIES As Long = 109
    Const END_OF_CHAIN_SID As Long = -2     ' &HFFFFFFFE, terminates a sector chain

    Dim i As Long
    Dim next_SID As Long
    Dim count As Long
    Dim total As Long
    Dim tmp As Long

    total = cf.Header.FATSectorsCount
    If total <= 0 Then
        ' ReDim with an upper bound of -1 would raise a run-time error.
        parseDIFAT = "复合文档:The number of FAT sectors must be greater than 0."
        Exit Function
    End If

    ReDim cf.DIFAT(total - 1) As Long

    ' Entries stored in the header.
    For i = 0 To HEADER_DIFAT_ENTRIES - 1
        ' Never write past the FAT sector count announced by the header.
        If i >= total Then Exit Function

        If cf.Header.DIFATS(i) = -1 Then
            ' The header holds fewer than 109 entries, which is always the
            ' case for small files.
            Exit Function
        End If

        cf.DIFAT(i) = cf.Header.DIFATS(i)
    Next i

    ' Remaining entries live in dedicated DIFAT sectors.
    count = HEADER_DIFAT_ENTRIES
    next_SID = cf.Header.FirstDIFATSID

    ' Stop when every entry has been collected or the chain ends. Without the
    ' end-of-chain test, a header with exactly 109 entries or a completely
    ' full last DIFAT sector would seek to an invalid sector.
    Do While count < total And next_SID <> END_OF_CHAIN_SID And next_SID <> Free_SID
        ' Position the reader at the start of the DIFAT sector.
        cf.r.SeekFile getOffsetBySID(next_SID), OriginF

        ' All Longs of a sector except the last are DIFAT entries (127 of
        ' them in a 512-byte sector); the last one is the ID of the next
        ' DIFAT sector.
        For i = 0 To cf.longNumPerSector - 1 - 1
            If count >= total Then Exit Function

            tmp = cf.r.ReadLong()
            If tmp = Free_SID Then
                ' A free entry marks the end of the used part of the table.
                Exit Function
            End If

            cf.DIFAT(count) = tmp
            count = count + 1
        Next i

        ' The last 4 bytes of the sector hold the ID of the next DIFAT sector.
        next_SID = cf.r.ReadLong()
    Loop

End Function

03

解析FAT数组

通过DIFAT来解析FAT数组,FAT数组是构建扇区链表的重要数据信息:

代码语言:vba
复制
' Reads the sector allocation table (FAT) into cf.FAT.
' The FAT is an array indexed by sector ID; each value is the ID of the next
' sector in the chain.
'
' The FAT sectors are visited in the order listed in cf.DIFAT, so cf.DIFAT
' must already be filled. The return value is never assigned, so the
' function always returns an empty string.
Private Function parseFAT() As String
    Dim sectorIdx As Long
    Dim entryIdx As Long
    Dim writePos As Long
    Dim entriesPerSector As Long
    Dim fatSectorTotal As Long

    entriesPerSector = cf.longNumPerSector
    fatSectorTotal = cf.Header.FATSectorsCount

    ' One sector stores entriesPerSector entries (128 in a 512-byte sector).
    ReDim cf.FAT(fatSectorTotal * entriesPerSector - 1) As Long

    writePos = 0
    sectorIdx = 0
    Do While sectorIdx < fatSectorTotal
        ' Jump to the start of the current FAT sector.
        cf.r.SeekFile getOffsetBySID(cf.DIFAT(sectorIdx)), OriginF

        ' Copy every entry of the sector into the flat FAT array.
        For entryIdx = 1 To entriesPerSector
            cf.FAT(writePos) = cf.r.ReadLong()
            writePos = writePos + 1
        Next entryIdx

        sectorIdx = sectorIdx + 1
    Loop
End Function

FAT数组解析出来之后,就可以去解析目录信息和MiniFAT了,因为目录信息和MiniFAT的数据很有可能需要多个扇区储存,必然需要扇区链接的信息,所以FAT数组必须要先进行解析。

下一篇
举报
领券