228 lines
8.3 KiB
Python
Raw Normal View History

2024-01-23 13:52:41 -05:00
#-----------------------------------------------------------------------------
# Copyright (c) 2013-2023, PyInstaller Development Team.
#
# Distributed under the terms of the GNU General Public License (version 2
# or later) with exception for distributing the bootloader.
#
# The full license is in the file COPYING.txt, distributed with this software.
#
# SPDX-License-Identifier: (GPL-2.0-or-later WITH Bootloader-exception)
#-----------------------------------------------------------------------------
"""
Python-based CArchive (PKG) reader implementation. Used only in the archive_viewer utility.
"""
import os
import struct
from PyInstaller.loader.pyimod01_archive import ZlibArchiveReader, ArchiveReadError
class NotAnArchiveError(TypeError):
pass
# Type codes for CArchive TOC entries
PKG_ITEM_BINARY = 'b' # binary
PKG_ITEM_DEPENDENCY = 'd' # runtime option
PKG_ITEM_PYZ = 'z' # zlib (pyz) - frozen Python code
PKG_ITEM_ZIPFILE = 'Z' # zlib (pyz) - frozen Python code
PKG_ITEM_PYPACKAGE = 'M' # Python package (__init__.py)
PKG_ITEM_PYMODULE = 'm' # Python module
PKG_ITEM_PYSOURCE = 's' # Python script (v3)
PKG_ITEM_DATA = 'x' # data
PKG_ITEM_RUNTIME_OPTION = 'o' # runtime option
PKG_ITEM_SPLASH = 'l' # splash resources
class CArchiveReader:
"""
Reader for PyInstaller's CArchive (PKG) archive.
"""
# Cookie - holds some information for the bootloader. C struct format definition. '!' at the beginning means network
# byte order. C struct looks like:
#
# typedef struct _cookie {
# char magic[8]; /* 'MEI\014\013\012\013\016' */
# uint32_t len; /* len of entire package */
# uint32_t TOC; /* pos (rel to start) of TableOfContents */
# int TOClen; /* length of TableOfContents */
# int pyvers; /* new in v4 */
# char pylibname[64]; /* Filename of Python dynamic library. */
# } COOKIE;
#
_COOKIE_MAGIC_PATTERN = b'MEI\014\013\012\013\016'
_COOKIE_FORMAT = '!8sIIii64s'
_COOKIE_LENGTH = struct.calcsize(_COOKIE_FORMAT)
# TOC entry:
#
# typedef struct _toc {
# int structlen; /* len of this one - including full len of name */
# uint32_t pos; /* pos rel to start of concatenation */
# uint32_t len; /* len of the data (compressed) */
# uint32_t ulen; /* len of data (uncompressed) */
# char cflag; /* is it compressed (really a byte) */
# char typcd; /* type code -'b' binary, 'z' zlib, 'm' module,
# * 's' script (v3),'x' data, 'o' runtime option */
# char name[1]; /* the name to save it as */
# /* starting in v5, we stretch this out to a mult of 16 */
# } TOC;
#
_TOC_ENTRY_FORMAT = '!iIIIBB'
_TOC_ENTRY_LENGTH = struct.calcsize(_TOC_ENTRY_FORMAT)
def __init__(self, filename):
self._filename = filename
self._start_offset = 0
self._toc_offset = 0
self._toc_length = 0
self.toc = {}
self.options = []
# Load TOC
with open(self._filename, "rb") as fp:
# Find cookie MAGIC pattern
cookie_start_offset = self._find_magic_pattern(fp, self._COOKIE_MAGIC_PATTERN)
if cookie_start_offset == -1:
raise ArchiveReadError("Could not find COOKIE magic pattern!")
# Read the whole cookie
fp.seek(cookie_start_offset, os.SEEK_SET)
cookie_data = fp.read(self._COOKIE_LENGTH)
magic, archive_length, toc_offset, toc_length, pyvers, pylib_name = \
struct.unpack(self._COOKIE_FORMAT, cookie_data)
# Compute start of the the archive
self._start_offset = (cookie_start_offset + self._COOKIE_LENGTH) - archive_length
# Verify that Python shared library name is set
if not pylib_name:
raise ArchiveReadError("Python shared library name not set in the archive!")
# Read whole toc
fp.seek(self._start_offset + toc_offset)
toc_data = fp.read(toc_length)
self.toc, self.options = self._parse_toc(toc_data)
@staticmethod
def _find_magic_pattern(fp, magic_pattern):
# Start at the end of file, and scan back-to-start
fp.seek(0, os.SEEK_END)
end_pos = fp.tell()
# Scan from back
SEARCH_CHUNK_SIZE = 8192
magic_offset = -1
while end_pos >= len(magic_pattern):
start_pos = max(end_pos - SEARCH_CHUNK_SIZE, 0)
chunk_size = end_pos - start_pos
# Is the remaining chunk large enough to hold the pattern?
if chunk_size < len(magic_pattern):
break
# Read and scan the chunk
fp.seek(start_pos, os.SEEK_SET)
buf = fp.read(chunk_size)
pos = buf.rfind(magic_pattern)
if pos != -1:
magic_offset = start_pos + pos
break
# Adjust search location for next chunk; ensure proper overlap
end_pos = start_pos + len(magic_pattern) - 1
return magic_offset
@classmethod
def _parse_toc(cls, data):
options = []
toc = {}
cur_pos = 0
while cur_pos < len(data):
# Read and parse the fixed-size TOC entry header
entry_length, entry_offset, data_length, uncompressed_length, compression_flag, typecode = \
struct.unpack(cls._TOC_ENTRY_FORMAT, data[cur_pos:(cur_pos + cls._TOC_ENTRY_LENGTH)])
cur_pos += cls._TOC_ENTRY_LENGTH
# Read variable-length name
name_length = entry_length - cls._TOC_ENTRY_LENGTH
name, *_ = struct.unpack(f'{name_length}s', data[cur_pos:(cur_pos + name_length)])
cur_pos += name_length
# Name string may contain up to 15 bytes of padding
name = name.rstrip(b'\0').decode('utf-8')
typecode = chr(typecode)
# The TOC should not contain duplicates, except for OPTION entries. Therefore, keep those
# in a separate list. With options, the rest of the entries do not make sense, anyway.
if typecode == 'o':
options.append(name)
else:
toc[name] = (entry_offset, data_length, uncompressed_length, compression_flag, typecode)
return toc, options
def extract(self, name):
"""
Extract data for the given entry name.
"""
entry = self.toc.get(name)
if entry is None:
raise KeyError(f"No entry named {name} found in the archive!")
entry_offset, data_length, uncompressed_length, compression_flag, typecode = entry
with open(self._filename, "rb") as fp:
fp.seek(self._start_offset + entry_offset, os.SEEK_SET)
data = fp.read(data_length)
if compression_flag:
import zlib
data = zlib.decompress(data)
return data
def open_embedded_archive(self, name):
"""
Open new archive reader for the embedded archive.
"""
entry = self.toc.get(name)
if entry is None:
raise KeyError(f"No entry named {name} found in the archive!")
entry_offset, data_length, uncompressed_length, compression_flag, typecode = entry
if typecode == PKG_ITEM_PYZ:
# Open as embedded archive, without extraction.
return ZlibArchiveReader(self._filename, self._start_offset + entry_offset)
elif typecode == PKG_ITEM_ZIPFILE:
raise NotAnArchiveError("Zipfile archives not supported yet!")
else:
raise NotAnArchiveError(f"Entry {name} is not a supported embedded archive!")
def pkg_archive_contents(filename, recursive=True):
"""
List the contents of the PKG / CArchive. If `recursive` flag is set (the default), the contents of the embedded PYZ
archive is included as well.
Used by the tests.
"""
contents = []
pkg_archive = CArchiveReader(filename)
for name, toc_entry in pkg_archive.toc.items():
*_, typecode = toc_entry
contents.append(name)
if typecode == PKG_ITEM_PYZ and recursive:
pyz_archive = pkg_archive.open_embedded_archive(name)
for name in pyz_archive.toc.keys():
contents.append(name)
return contents