Files
server/shared/lib/Blob/Signature.php
2026-02-10 18:46:11 -05:00

231 lines
6.3 KiB
PHP

<?php
declare(strict_types=1);
/**
* SPDX-FileCopyrightText: Sebastian Krupinski <krupinski01@gmail.com>
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
namespace KTXF\Blob;
use finfo;
/**
* Signature - Analyzes binary content to determine MIME type and format
*
* This utility only requires the first bytes of a file to detect its format,
* making it compatible with streams, chunked uploads, and remote storage backends like S3.
*
* Uses PHP's built-in finfo extension (libmagic) for reliable detection, falling
* back to custom magic byte detection if finfo is unavailable or cannot determine a format.
*/
class Signature {
    /** Minimum bytes needed for reliable detection */
    public const HEADER_SIZE = 256;

    /**
     * Fallback magic byte signatures, tried in order when finfo did not yield a format.
     *
     * 'offset' is the byte position of the signature inside the header and
     * 'bytes' is the signature itself written as hexadecimal.
     */
    private const SIGNATURES = [
        ['offset' => 0, 'bytes' => 'FFD8FF', 'format' => 'jpeg'],
        ['offset' => 0, 'bytes' => '89504E470D0A1A0A', 'format' => 'png'],
        ['offset' => 0, 'bytes' => '47494638', 'format' => 'gif'],
        ['offset' => 0, 'bytes' => '25504446', 'format' => 'pdf'],
        ['offset' => 0, 'bytes' => '504B0304', 'format' => 'zip'],
        ['offset' => 0, 'bytes' => '1F8B08', 'format' => 'gzip'],
        ['offset' => 4, 'bytes' => '66747970', 'format' => 'mp4'],
        ['offset' => 0, 'bytes' => '494433', 'format' => 'mp3'],
        ['offset' => 0, 'bytes' => 'FFFB', 'format' => 'mp3'],
        ['offset' => 0, 'bytes' => '52494646', 'format' => 'riff'], // WAV/AVI/WEBP
    ];

    /** Number of leading bytes sampled by the plain-text heuristic */
    private const TEXT_SAMPLE_SIZE = 256;

    /** Share of sampled bytes that must look like text (exclusive lower bound) */
    private const TEXT_THRESHOLD = 0.9;

    /** Cached finfo instance */
    private static ?finfo $finfo = null;

    /**
     * Detect both MIME type and format from content bytes in a single operation
     *
     * finfo is consulted first; the magic byte table is used when it yields no
     * format. An empty input is reported as binary.
     *
     * @param string $headerBytes First bytes of the file content (256 recommended)
     * @return array{mime: string, format: string} Array with 'mime' and 'format' keys
     */
    public static function detect(string $headerBytes): array {
        if ($headerBytes === '') {
            return ['mime' => MimeTypes::MIME_BINARY, 'format' => MimeTypes::FORMAT_BINARY];
        }

        // detectMimeType() already returns null when fileinfo is not loaded,
        // so no separate extension check is needed here.
        $mime = self::detectMimeType($headerBytes);
        $format = null;
        if ($mime !== null) {
            $format = MimeTypes::toFormat($mime);
            if ($format === null && $mime !== MimeTypes::MIME_BINARY) {
                $format = MimeTypes::parseFormat($mime);
            }
        }

        // Fallback to magic bytes if format not determined
        $format ??= self::detectFromMagicBytes($headerBytes);

        // A missing or generic binary MIME type is replaced by the one mapped from the format
        if ($mime === null || $mime === MimeTypes::MIME_BINARY) {
            $mime = MimeTypes::toMime($format) ?? MimeTypes::MIME_BINARY;
        }

        return ['mime' => $mime, 'format' => $format];
    }

    /**
     * Detect both MIME type and format from a stream in a single operation
     *
     * Reads up to HEADER_SIZE bytes from the current stream position and then
     * seeks back to that position.
     *
     * @param resource $stream File stream
     * @return array{mime: string, format: string} Array with 'mime' and 'format' keys
     */
    public static function detectFromStream($stream): array {
        // detect() reports binary for an empty header, which covers failed or empty reads
        return self::detect(self::readHeader($stream));
    }

    /**
     * Detect file format from content bytes
     *
     * @param string $headerBytes First bytes of the file content (256 recommended)
     * @return string Detected format (e.g., 'jpeg', 'png', 'pdf') or 'binary' if unknown
     */
    public static function detectFormat(string $headerBytes): string {
        return self::detect($headerBytes)['format'];
    }

    /**
     * Detect MIME type from content bytes using finfo
     *
     * @param string $headerBytes Content bytes
     * @return string|null MIME type, or null when fileinfo is not loaded or detection fails
     */
    public static function detectMimeType(string $headerBytes): ?string {
        if (!extension_loaded('fileinfo')) {
            return null;
        }

        // Create the finfo handle once and reuse it for later calls
        self::$finfo ??= new finfo(FILEINFO_MIME_TYPE);

        $mime = self::$finfo->buffer($headerBytes);

        return $mime !== false ? $mime : null;
    }

    /**
     * Detect file format from a stream
     *
     * Reads up to HEADER_SIZE bytes from the current stream position, detects
     * the format, and seeks back to that position.
     *
     * @param resource $stream File stream
     * @return string Detected format
     */
    public static function detectFormatFromStream($stream): string {
        return self::detectFormat(self::readHeader($stream));
    }

    /**
     * Detect MIME type from a stream
     *
     * Reads up to HEADER_SIZE bytes from the current stream position and then
     * seeks back to that position.
     *
     * @param resource $stream File stream
     * @return string|null MIME type, or null when nothing could be read or detection fails
     */
    public static function detectMimeTypeFromStream($stream): ?string {
        $headerBytes = self::readHeader($stream);

        // Nothing was read, so there is nothing to hand to finfo
        if ($headerBytes === '') {
            return null;
        }

        return self::detectMimeType($headerBytes);
    }

    /**
     * Read the header bytes from a stream and restore its position
     *
     * The result of the seek back is not checked, so on a stream that cannot
     * seek the bytes read here remain consumed.
     *
     * @param resource $stream File stream
     * @return string Up to HEADER_SIZE bytes, or an empty string when the read fails
     */
    private static function readHeader($stream): string {
        $position = ftell($stream);
        $headerBytes = fread($stream, self::HEADER_SIZE);

        // Only seek back when the position is known: with strict_types enabled,
        // passing the false returned by a failed ftell() to fseek() raises a TypeError.
        if ($position !== false) {
            fseek($stream, $position);
        }

        return $headerBytes === false ? '' : $headerBytes;
    }

    /**
     * Fallback detection using magic bytes
     *
     * Compares the header against each SIGNATURES entry in order and returns
     * the first match; otherwise applies the plain-text heuristic.
     *
     * @param string $headerBytes Content bytes
     * @return string Detected format or 'binary'
     */
    private static function detectFromMagicBytes(string $headerBytes): string {
        foreach (self::SIGNATURES as $sig) {
            $magic = hex2bin($sig['bytes']);

            // A header too short to hold the signature yields a shorter slice
            // and therefore simply does not match.
            if (substr($headerBytes, $sig['offset'], strlen($magic)) === $magic) {
                return $sig['format'];
            }
        }

        // Check if likely text
        if (self::isLikelyText($headerBytes)) {
            return 'text';
        }

        return MimeTypes::FORMAT_BINARY;
    }

    /**
     * Check if content appears to be text
     *
     * Content starting with a UTF-8 BOM is always treated as text. Otherwise
     * the leading bytes are sampled and the content counts as text when more
     * than 90% of them are printable ASCII, tab, line feed, carriage return,
     * or bytes in the 128-247 range.
     *
     * @param string $bytes Content bytes
     * @return bool True when the sample looks like text; false for empty input
     */
    private static function isLikelyText(string $bytes): bool {
        // Check for UTF-8 BOM
        if (str_starts_with($bytes, "\xEF\xBB\xBF")) {
            return true;
        }

        $length = min(strlen($bytes), self::TEXT_SAMPLE_SIZE);

        // Empty input carries no evidence of text and would otherwise divide by zero
        if ($length === 0) {
            return false;
        }

        $printableCount = 0;
        for ($i = 0; $i < $length; $i++) {
            $byte = ord($bytes[$i]);
            $isAsciiText = ($byte >= 32 && $byte <= 126) || $byte === 9 || $byte === 10 || $byte === 13;
            // Bytes 128-247 are counted as UTF-8 bytes
            $isUtf8Byte = $byte >= 128 && $byte <= 247;

            if ($isAsciiText || $isUtf8Byte) {
                $printableCount++;
            }
        }

        return ($printableCount / $length) > self::TEXT_THRESHOLD;
    }
}