/
www
/
wwwroot
/
alo88.autos
/
wp-content
/
plugins
/
wp-content-crawler
/
app
/
Objects
/
Api
/
OpenAi
/
Tokenizer
/
Upload File
HOME
<?php
/**
 * Created by PhpStorm.
 * User: tsaricam
 * Date: 04/02/2023
 * Time: 23:11
 *
 * @since 1.13.0
 */

namespace WPCCrawler\Objects\Api\OpenAi\Tokenizer;

use Illuminate\Contracts\Filesystem\FileNotFoundException;
use WPCCrawler\Factory;
use WPCCrawler\Objects\Enums\InformationType;
use WPCCrawler\Objects\File\FileService;
use WPCCrawler\Objects\Informing\Information;
use WPCCrawler\Objects\Informing\Informer;

/**
 * PHP implementation of https://github.com/latitudegames/GPT-3-Encoder/blob/master/Encoder.js
 *
 * Use {@link encode()} to retrieve the token IDs of a text. The result can be used to learn how many tokens a text
 * contains. If a need for recreating the original text arises, use {@link decode()} to create the original text.
 */
class Gpt3Tokenizer {

    /** Path of the token-to-ID map, relative to the directory resolved by {@link getDataPath()} */
    public const ENCODER_PATH   = 'encoder.json';

    /** Path of the BPE merge rules, relative to the directory resolved by {@link getDataPath()} */
    public const VOCAB_BPE_PATH = 'vocab.bpe';

    /** @var Gpt3Tokenizer|null */
    private static $instance = null;

    /**
     * @var string Regex pattern that matches tokens. This is not the pattern used by OpenAI's Tokenizer page. In that
     *      page, there is an almost 20k-char regex pattern, which can be seen in one of the JS files loaded in the
     *      page. Although this pattern does not match the same texts as the original tokenizer does, the results of
     *      this is pretty close to the original one. It looks like this pattern results in more token IDs. If it
     *      found fewer tokens, it could be a problem. However, finding more token IDs is not that big of a problem,
     *      since our goal is to learn how many tokens a text contains so that we can arrange the contents of a
     *      request made to OpenAI API, to avoid errors caused by sending too many tokens in one request.
     *      This regex is retrieved from the NodeJS library whose link is added to the OpenAI's Tokenizer page as a
     *      recommended way to access the original tokenizer programmatically. The NodeJS lib's file and the file
     *      from HuggingFace's library are linked below as a reference.
     *
     * @see https://platform.openai.com/tokenizer
     * @see https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/tokenization_gpt2.py
     * @see https://github.com/openai/gpt-2/blob/master/src/encoder.py
     * @see https://github.com/latitudegames/GPT-3-Encoder/blob/master/Encoder.js
     */
    private $pattern = "'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";

    /** @var array<int, string>|null Byte value => printable character. Lazily created. */
    private $byteEncoder = null;

    /** @var array<string, int>|null Printable character => byte value. Lazily created. */
    private $byteDecoder = null;

    /** @var array<string, int>|null Token text => token ID. Lazily read from {@link ENCODER_PATH}. */
    private $encoder = null;

    /** @var array<int, string>|null Token ID => token text. Lazily created. */
    private $decoder = null;

    /** @var array<string, int>|null Pair key => merge rank. Lazily read from {@link VOCAB_BPE_PATH}. */
    private $bpeRanks = null;

    /** @var array<string, string> Byte-encoded token => space-separated BPE result */
    private $cache = [];

    /**
     * @var string Used to separate the pairs. This separator character must NOT be one of the characters returned by
     *      {@link createByteEncoder()} to avoid an infinite loop.
     */
    private $separator = "˧";

    /**
     * @return Gpt3Tokenizer The instance of the class
     * @since 1.13.0
     */
    public static function getInstance(): Gpt3Tokenizer {
        if (self::$instance === null) {
            self::$instance = new Gpt3Tokenizer();
        }

        return self::$instance;
    }

    /*
     *
     */

    /** This is a singleton. Use {@link Gpt3Tokenizer::getInstance()}. */
    protected function __construct() { }

    /**
     * @param string[] $texts The texts whose sum of token count is wanted
     * @return int The number of tokens in the given texts in total
     * @since 1.13.0
     */
    public function getTokenCount(array $texts): int {
        $count = 0;
        foreach($texts as $text) {
            $count += count($this->encode($text));
        }

        return $count;
    }

    /**
     * @param string $text The text to be encoded
     * @return int[] Token IDs. Empty if the pattern matches nothing in the text. Token texts that do not exist in
     *               the encoder are represented by `0`.
     * @since 1.13.0
     */
    public function encode(string $text): array {
        $bpeTokens = [];

        preg_match_all(sprintf('/%s/u', $this->pattern), $text, $matches);
        $tokens = $matches[0] ?? null;
        if (!$tokens) {
            return $bpeTokens;
        }

        $encoder     = $this->getEncoder();
        $byteEncoder = $this->getByteEncoder();

        foreach($tokens as $token) {
            // Map every byte of the token to its printable stand-in character.
            $encodedToken = '';
            foreach($this->encodeStr($token) as $byte) {
                $encodedToken .= $byteEncoder[$byte] ?? ' ';
            }

            // Append directly instead of array_merge()-ing in the loop, which copies the whole result every time.
            foreach(explode(' ', $this->bpe($encodedToken)) as $piece) {
                $bpeTokens[] = (int) ($encoder[$piece] ?? 0);
            }
        }

        return $bpeTokens;
    }

    /**
     * @param int[] $tokens The tokens retrieved via {@link encode()}
     * @return string Decoded tokens, which is the original text that was previously encoded
     * @since 1.13.0
     */
    public function decode(array $tokens): string {
        $decoder     = $this->getDecoder();
        $byteDecoder = $this->getByteDecoder();

        // Token IDs => concatenated token texts. Unknown IDs become a space.
        $decodedText = '';
        foreach($tokens as $tokenId) {
            $decodedText .= $decoder[$tokenId] ?? ' ';
        }

        // Printable stand-in characters => original byte values. Unknown characters become the 0 byte.
        $bytes = [];
        foreach(mb_str_split($decodedText, 1, 'UTF-8') as $char) {
            $bytes[] = (int) ($byteDecoder[$char] ?? 0);
        }

        return $this->decodeStr($bytes);
    }

    /*
     *
     */

    /**
     * @param string[] $word A character array whose character pairs are needed. This can be created via
     *                       {@link mb_str_split()}
     * @return string[] The adjacent pairs of the word, in order. Each pair is a single string made by joining the
     *                  two parts with the separator of this class, because PHP has no tuple type that could be used
     *                  as an array key. Empty if the word has fewer than 2 items.
     * @since 1.13.0
     */
    public function getPairs(array $word): array {
        $length = count($word);
        if ($length < 2) {
            return [];
        }

        $pairs    = [];
        $prevChar = $word[0];
        for($i = 1; $i < $length; $i++) {
            $char     = $word[$i];
            $pairs[]  = "{$prevChar}{$this->separator}{$char}";
            $prevChar = $char;
        }

        return $pairs;
    }

    /**
     * @param string $str The string to be encoded to an unsigned 8-bit int array
     * @return int[] Byte values of the string. The keys are the ones returned by {@link unpack()}.
     * @since 1.13.0
     */
    public function encodeStr(string $str): array {
        return unpack("C*", $str) ?: [];
    }

    /**
     * @param int[] $values Unsigned 8-bit int array
     * @return string The string made of the given bytes
     * @since 1.13.0
     */
    public function decodeStr(array $values): string {
        $result = pack("C*", ...$values);
        return $result === false // @phpstan-ignore-line
            ? ''
            : $result;
    }

    /**
     * @return array<int, string> Byte value => printable character
     * @since 1.13.0
     */
    public function getByteEncoder(): array {
        if ($this->byteEncoder === null) {
            $this->byteEncoder = $this->createByteEncoder();
        }

        return $this->byteEncoder;
    }

    /**
     * @return array<string, int> Printable character => byte value
     * @since 1.13.0
     */
    public function getByteDecoder(): array {
        if ($this->byteDecoder === null) {
            $this->byteDecoder = array_flip($this->getByteEncoder());
        }

        return $this->byteDecoder;
    }

    /**
     * @return array<string, int> Token text => token ID
     * @since 1.13.0
     */
    public function getEncoder(): array {
        if ($this->encoder === null) {
            $this->encoder = $this->readEncoderFile();
        }

        return $this->encoder;
    }

    /**
     * @return array<int, string> Token ID => token text
     * @since 1.13.0
     */
    public function getDecoder(): array {
        if ($this->decoder === null) {
            $this->decoder = array_flip($this->getEncoder());
        }

        return $this->decoder;
    }

    /**
     * @return array<string, int> Pair key => merge rank. A lower rank means the pair is merged earlier.
     * @since 1.13.0
     */
    public function getBpeRanks(): ?array {
        if ($this->bpeRanks === null) {
            $this->bpeRanks = $this->createBpeRanks();
        }

        return $this->bpeRanks;
    }

    /*
     * HELPERS
     */

    /**
     * Byte-pair encoding (BPE)
     *
     * @param string $token A token whose bytes are already mapped via the byte encoder
     * @return string The parts of the token after applying the merge rules, separated by a space. If the token has
     *                fewer than 2 characters, it is returned as-is.
     * @since 1.13.0
     */
    protected function bpe(string $token): string {
        if (isset($this->cache[$token])) {
            return $this->cache[$token];
        }

        /** @var string[] $word */
        $word  = mb_str_split($token, 1, 'UTF-8');
        $pairs = $this->getPairs($word);
        if (!$pairs) {
            return $token;
        }

        $ranks = $this->getBpeRanks() ?: [];

        while (true) {
            $bigram = $this->findLowestRankedPair($pairs, $ranks);
            if ($bigram === null) {
                // None of the remaining pairs has a merge rule.
                break;
            }

            $bigramArr = explode($this->separator, $bigram);
            if (count($bigramArr) < 2) {
                // Nothing can be merged. Stop here, since retrying with the same pairs would never terminate.
                break;
            }

            $word = $this->mergePair($word, $bigramArr[0], $bigramArr[1]);
            if (count($word) === 1) {
                break;
            }

            $pairs = $this->getPairs($word);
        }

        $wordStr = implode(' ', $word);
        $this->cache[$token] = $wordStr;

        // Evict the earlier caches to free up some memory
        if (count($this->cache) > 16000) {
            $this->cache = array_slice($this->cache, 6000, null, true);
        }

        return $wordStr;
    }

    /**
     * Finds the pair that should be merged next.
     *
     * @param string[]           $pairs Pair keys created by {@link getPairs()}
     * @param array<string, int> $ranks Pair key => merge rank
     * @return string|null The pair having the lowest rank, or `null` if none of the pairs has a rank
     */
    private function findLowestRankedPair(array $pairs, array $ranks): ?string {
        $bestPair = null;
        $bestRank = null;
        foreach($pairs as $pair) {
            if (!isset($ranks[$pair])) {
                continue;
            }

            $rank = $ranks[$pair];
            if ($bestRank === null || $rank < $bestRank) {
                $bestRank = $rank;
                $bestPair = $pair;
            }
        }

        return $bestPair;
    }

    /**
     * Joins every adjacent occurrence of `$first` followed by `$second` into a single item, scanning left to right.
     *
     * @param string[] $word   The current parts of the word
     * @param string   $first  Left part of the pair
     * @param string   $second Right part of the pair
     * @return string[] The parts of the word after the merge
     */
    private function mergePair(array $word, string $first, string $second): array {
        $merged = [];
        $length = count($word);
        $i      = 0;
        while ($i < $length) {
            if ($i < $length - 1 && $word[$i] === $first && $word[$i + 1] === $second) {
                $merged[] = $first . $second;
                $i += 2;

            } else {
                $merged[] = $word[$i];
                $i += 1;
            }
        }

        return $merged;
    }

    /**
     * Creates the merge ranks from the file at {@link VOCAB_BPE_PATH}. The first line and the last item of the
     * newline-split contents are skipped.
     *
     * @return array<string, int> Pair key => zero-based position of the merge rule. Empty if the file cannot be
     *                            read.
     * @since 1.13.0
     */
    protected function createBpeRanks(): array {
        $contents = $this->readFile(self::VOCAB_BPE_PATH);
        if ($contents === null) {
            return [];
        }

        $lines     = explode("\n", $contents);
        $separator = $this->separator;

        // The original implementation uses Python tuples as dictionary keys. PHP has no tuple type and arrays cannot
        // be array keys, so the parts of each rule are joined with the separator to create a string key.
        $bpeMerges = array_map(function(string $line) use ($separator): string {
            // Drop only the empty parts. A truthiness-based filter would also drop the part "0", which is a valid
            // part of many merge rules.
            $parts = preg_split('/\s+/', $line, -1, PREG_SPLIT_NO_EMPTY) ?: [];
            return implode($separator, $parts);
        }, array_slice($lines, 1, count($lines) - 2));

        // Key => position. Unlike array_combine() with range(), this also works when there is no merge rule.
        return array_flip($bpeMerges);
    }

    /**
     * Reads the encoder file from the file system
     *
     * @return array<string, int> The parsed encoder file. Empty if the file cannot be read or parsed, in which case
     *                            the problem is reported via {@link Informer}.
     * @since 1.13.0
     */
    protected function readEncoderFile(): array {
        $contents = $this->readFile(self::ENCODER_PATH);
        if ($contents === null) {
            return [];
        }

        $data = json_decode($contents, true);
        if (!is_array($data)) {
            $message = _wpcc('The encoder file for GPT-3 tokenizer could not be read.');
            $info    = new Information($message, json_last_error_msg(), InformationType::ERROR);
            Informer::add($info->addAsLog());
            return [];
        }

        return $data;
    }

    /**
     * Creates the map that assigns a printable character to each of the 256 byte values. The byte values in the
     * ranges below are mapped to the characters having the same code points. The remaining byte values are mapped,
     * in ascending order, to the code points starting from 256.
     *
     * @return array<int, string> Byte value => printable character
     * @since 1.13.0
     */
    protected function createByteEncoder(): array {
        // Code points are written explicitly and the encoding is always passed, so that the result does not depend
        // on the value of mb_internal_encoding().
        $bs = array_merge(
            range(0x21, 0x7E), // "!" to "~"
            range(0xA1, 0xAC), // "¡" to "¬"
            range(0xAE, 0xFF), // "®" to "ÿ"
        );

        $cs = array_map(function(int $b) {
            return mb_chr($b, 'UTF-8');
        }, $bs);

        $alreadyMapped = array_flip($bs);
        $length        = 2 ** 8;
        $n             = 0;
        for($b = 0; $b < $length; $b++) {
            if (isset($alreadyMapped[$b])) continue;

            $bs[] = $b;
            $cs[] = mb_chr($length + $n, 'UTF-8');
            $n += 1;
        }

        return array_combine($bs, $cs) ?: [];
    }

    /**
     * @param string $relativePath File path relative to the openai/data directory
     * @return string|null The contents of the file. If the file is not found, or it could not be read, returns
     *                     `null`, after reporting the problem via {@link Informer}.
     * @since 1.13.0
     */
    protected function readFile(string $relativePath): ?string {
        $filePath = $this->getDataPath($relativePath);
        $fs       = FileService::getInstance()->getFileSystem();

        if (!$fs->exists($filePath) || !$fs->isFile($filePath)) {
            Informer::addError(sprintf(_wpcc('File "%1$s" could not be found.'), $filePath))
                ->addAsLog();
            return null;
        }

        try {
            $contents = $fs->get($filePath);

        } catch (FileNotFoundException $e) {
            Informer::addError($e->getMessage())
                ->setException($e)
                ->addAsLog();
            return null;
        }

        return $contents;
    }

    /**
     * @param string $relativeFilePath A file path relative to `data/openai/` directory.
     * @return string The path created by the asset manager for the given relative path
     * @since 1.13.0
     */
    protected function getDataPath(string $relativeFilePath): string {
        return Factory::assetManager()
            ->pluginPath('data/openai/' . $relativeFilePath);
    }

}