<?php

declare(strict_types=1);

/**
 * SPDX-FileCopyrightText: Sebastian Krupinski <krupinski01@gmail.com>
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

namespace KTXF\Blob;
use finfo;

/**
 * Signature - Analyzes binary content to determine MIME type and format
 *
 * This utility only requires the first bytes of a file to detect its format,
 * making it compatible with streams, chunked uploads, and remote storage backends like S3.
 *
 * Uses PHP's built-in finfo extension (libmagic) for reliable detection, with a
 * fallback to custom magic byte detection if finfo is unavailable.
 */
class Signature {

	/** Number of leading bytes read from a stream; enough for reliable detection */
	public const HEADER_SIZE = 256;

	/**
	 * Fallback magic byte signatures for when finfo is unavailable or inconclusive.
	 *
	 * 'offset' is counted in bytes from the start of the content, 'bytes' is the
	 * expected signature as an upper-case hex string. Entries are tried in order
	 * and the first match wins.
	 */
	private const SIGNATURES = [
		['offset' => 0, 'bytes' => 'FFD8FF', 'format' => 'jpeg'],
		['offset' => 0, 'bytes' => '89504E470D0A1A0A', 'format' => 'png'],
		['offset' => 0, 'bytes' => '47494638', 'format' => 'gif'],
		['offset' => 0, 'bytes' => '25504446', 'format' => 'pdf'],
		['offset' => 0, 'bytes' => '504B0304', 'format' => 'zip'],
		['offset' => 0, 'bytes' => '1F8B08', 'format' => 'gzip'],
		['offset' => 4, 'bytes' => '66747970', 'format' => 'mp4'],
		['offset' => 0, 'bytes' => '494433', 'format' => 'mp3'],
		['offset' => 0, 'bytes' => 'FFFB', 'format' => 'mp3'],
		['offset' => 0, 'bytes' => '52494646', 'format' => 'riff'], // WAV/AVI/WEBP
	];

	/** Cached finfo instance, created on first use */
	private static ?finfo $finfo = null;

	/**
	 * Detect both MIME type and format from content bytes in a single operation
	 *
	 * finfo is consulted first; the magic byte table is only used when finfo is
	 * unavailable or could not be mapped to a format.
	 *
	 * @param string $headerBytes First bytes of the file content (256 recommended)
	 * @return array{mime: string, format: string} Array with 'mime' and 'format' keys
	 */
	public static function detect(string $headerBytes): array {
		if ($headerBytes === '') {
			return ['mime' => MimeTypes::MIME_BINARY, 'format' => MimeTypes::FORMAT_BINARY];
		}

		$format = null;

		// detectMimeType() already yields null when the fileinfo extension is missing
		$mime = self::detectMimeType($headerBytes);
		if ($mime !== null) {
			$format = MimeTypes::toFormat($mime);
			if ($format === null && $mime !== MimeTypes::MIME_BINARY) {
				$format = MimeTypes::parseFormat($mime);
			}
		}

		// Fallback to magic bytes if format not determined
		$format ??= self::detectFromMagicBytes($headerBytes);

		// A missing or generic MIME type is replaced by the one mapped from the format
		if ($mime === null || $mime === MimeTypes::MIME_BINARY) {
			$mime = MimeTypes::toMime($format) ?? MimeTypes::MIME_BINARY;
		}

		return ['mime' => $mime, 'format' => $format];
	}

	/**
	 * Detect both MIME type and format from a stream in a single operation
	 *
	 * The stream position is restored afterwards when the stream is seekable.
	 *
	 * @param resource $stream File stream
	 * @return array{mime: string, format: string} Array with 'mime' and 'format' keys
	 */
	public static function detectFromStream($stream): array {
		// detect() maps an empty header to the binary defaults
		return self::detect(self::readHeader($stream));
	}

	/**
	 * Detect file format from content bytes
	 *
	 * @param string $headerBytes First bytes of the file content (256 recommended)
	 * @return string Detected format (e.g., 'jpeg', 'png', 'pdf') or 'binary' if unknown
	 */
	public static function detectFormat(string $headerBytes): string {
		return self::detect($headerBytes)['format'];
	}

	/**
	 * Detect MIME type from content bytes using finfo
	 *
	 * @param string $headerBytes Content bytes
	 * @return string|null MIME type, or null when fileinfo is not loaded or detection fails
	 */
	public static function detectMimeType(string $headerBytes): ?string {
		if (!extension_loaded('fileinfo')) {
			return null;
		}

		self::$finfo ??= new finfo(FILEINFO_MIME_TYPE);

		$mime = self::$finfo->buffer($headerBytes);

		return $mime === false ? null : $mime;
	}

	/**
	 * Detect file format from a stream
	 *
	 * Reads the header bytes, detects the format, and restores the stream
	 * position when the stream is seekable.
	 *
	 * @param resource $stream File stream
	 * @return string Detected format
	 */
	public static function detectFormatFromStream($stream): string {
		// detectFormat() maps an empty header to the binary format
		return self::detectFormat(self::readHeader($stream));
	}

	/**
	 * Detect MIME type from a stream
	 *
	 * The stream position is restored afterwards when the stream is seekable.
	 *
	 * @param resource $stream File stream
	 * @return string|null MIME type, or null when nothing could be read or detection fails
	 */
	public static function detectMimeTypeFromStream($stream): ?string {
		$headerBytes = self::readHeader($stream);

		// Do not hand an empty buffer to finfo; report "unknown" instead
		if ($headerBytes === '') {
			return null;
		}

		return self::detectMimeType($headerBytes);
	}

	/**
	 * Read up to HEADER_SIZE bytes from the current stream position
	 *
	 * The original position is restored only when it is known and the stream
	 * reports itself as seekable; on other streams the bytes read stay consumed.
	 *
	 * @param resource $stream File stream
	 * @return string Header bytes, or an empty string when nothing could be read
	 */
	private static function readHeader($stream): string {
		$position = ftell($stream);
		$headerBytes = fread($stream, self::HEADER_SIZE);

		// ftell() may return false, which must not be passed to fseek() under strict types
		$seekable = stream_get_meta_data($stream)['seekable'] ?? false;
		if ($position !== false && $seekable) {
			fseek($stream, $position);
		}

		return $headerBytes === false ? '' : $headerBytes;
	}

	/**
	 * Fallback detection using magic bytes
	 *
	 * @param string $headerBytes Content bytes
	 * @return string Detected format, 'text' for text-like content, or 'binary'
	 */
	private static function detectFromMagicBytes(string $headerBytes): string {
		$headerHex = strtoupper(bin2hex($headerBytes));

		foreach (self::SIGNATURES as $signature) {
			$expected = strtoupper($signature['bytes']);

			// Offsets are in bytes and every byte is two hex characters. A header that
			// is too short produces a shorter slice, which can never equal $expected.
			$actual = substr($headerHex, $signature['offset'] * 2, strlen($expected));
			if ($actual === $expected) {
				return $signature['format'];
			}
		}

		// No signature matched: distinguish text-like content from opaque binary
		if (self::isLikelyText($headerBytes)) {
			return 'text';
		}

		return MimeTypes::FORMAT_BINARY;
	}

	/**
	 * Check if content appears to be text
	 *
	 * Content starting with a UTF-8 BOM is always treated as text. Otherwise the
	 * first 256 bytes must be more than 90% printable ASCII, tab, LF, CR, or
	 * bytes in the 128-247 range (counted as UTF-8 bytes).
	 *
	 * @param string $bytes Content bytes
	 * @return bool True when the content looks like text
	 */
	private static function isLikelyText(string $bytes): bool {
		// Check for UTF-8 BOM
		if (str_starts_with($bytes, "\xEF\xBB\xBF")) {
			return true;
		}

		$length = min(strlen($bytes), 256);
		if ($length === 0) {
			// Nothing to inspect; also avoids a division by zero below
			return false;
		}

		$printableCount = 0;
		for ($i = 0; $i < $length; $i++) {
			$byte = ord($bytes[$i]);

			$isAsciiText = ($byte >= 32 && $byte <= 126) || $byte === 9 || $byte === 10 || $byte === 13;
			$isUtf8Byte = $byte >= 128 && $byte <= 247;

			if ($isAsciiText || $isUtf8Byte) {
				$printableCount++;
			}
		}

		return ($printableCount / $length) > 0.9;
	}

}