Text.php
6.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
<?php
namespace Faker\Provider;
abstract class Text extends Base
{
    /**
     * Corpus the Markov word table is built from. Empty in this abstract class;
     * presumably overridden by concrete providers (verify against subclasses).
     *
     * @var string
     */
    protected static $baseText = '';

    /**
     * String used to split the corpus into words and to join generated words.
     *
     * @var string
     */
    protected static $separator = ' ';

    /**
     * Number of characters each separator adds to the generated text; used for
     * length accounting in generateText().
     *
     * @var int
     */
    protected static $separatorLen = 1;

    /**
     * Lazily populated cache of the corpus split into words (see getExplodedText()).
     *
     * @var array|null
     */
    protected $explodedText;

    /**
     * Cache of word tables, keyed by index size (see getConsecutiveWords()).
     *
     * @var array
     */
    protected $consecutiveWords = [];

    /**
     * Whether generated text must begin with a word starting with an uppercase letter.
     *
     * @var bool
     */
    protected static $textStartsWithUppercase = true;

    /**
     * Generate a text string by the Markov chain algorithm.
     *
     * Depending on the $maxNbChars, returns a random valid looking text. The algorithm
     * generates a weighted table with the specified number of words as the index and the
     * possible following words as the value.
     *
     * This is a shortcut for realTextBetween() with the minimum length set to
     * 80% of $maxNbChars (rounded to the nearest integer).
     *
     * @example 'Alice, swallowing down her flamingo, and began by taking the little golden key'
     *
     * @param int $maxNbChars Maximum number of characters the text should contain (minimum: 10)
     * @param int $indexSize  Determines how many words are considered for the generation of the next word.
     *                        The minimum is 1, and it produces a higher level of randomness, although the
     *                        generated text usually doesn't make sense. Higher index sizes (up to 5)
     *                        produce more correct text, at the price of less randomness.
     *
     * @throws \InvalidArgumentException if an argument is outside its allowed range
     * @throws \OverflowException        if no text of acceptable length could be generated
     *
     * @return string
     */
    public function realText($maxNbChars = 200, $indexSize = 2)
    {
        return $this->realTextBetween((int) round($maxNbChars * 0.8), $maxNbChars, $indexSize);
    }

    /**
     * Generate a text string by the Markov chain algorithm, constrained to a length range.
     *
     * Returns a random valid looking text. The algorithm generates a weighted table with
     * the specified number of words as the index and the possible following words as the
     * value. Candidate texts are generated repeatedly until one is strictly longer than
     * $minNbChars; the generator gives up on the 100th pass of the retry loop.
     *
     * @example 'Alice, swallowing down her flamingo, and began by taking the little golden key'
     *
     * @param int $minNbChars Lower bound for the text length (at least 1, and strictly smaller than
     *                        $maxNbChars). The returned text is strictly longer than this value.
     * @param int $maxNbChars Maximum number of characters the text should contain (minimum: 10)
     * @param int $indexSize  Determines how many words are considered for the generation of the next word.
     *                        The minimum is 1, and it produces a higher level of randomness, although the
     *                        generated text usually doesn't make sense. Higher index sizes (up to 5)
     *                        produce more correct text, at the price of less randomness.
     *
     * @throws \InvalidArgumentException if $minNbChars < 1, $maxNbChars < 10, $indexSize is not in 1..5,
     *                                   or $minNbChars >= $maxNbChars
     * @throws \OverflowException        if the retry limit is reached without producing a long enough text
     *
     * @return string
     */
    public function realTextBetween($minNbChars = 160, $maxNbChars = 200, $indexSize = 2)
    {
        if ($minNbChars < 1) {
            throw new \InvalidArgumentException('minNbChars must be at least 1');
        }

        if ($maxNbChars < 10) {
            throw new \InvalidArgumentException('maxNbChars must be at least 10');
        }

        if ($indexSize < 1) {
            throw new \InvalidArgumentException('indexSize must be at least 1');
        }

        if ($indexSize > 5) {
            throw new \InvalidArgumentException('indexSize must be at most 5');
        }

        if ($minNbChars >= $maxNbChars) {
            throw new \InvalidArgumentException('minNbChars must be smaller than maxNbChars');
        }

        $words = $this->getConsecutiveWords($indexSize);
        $iterations = 0;

        do {
            ++$iterations;

            // Bail out instead of looping forever when the corpus cannot yield a long enough text.
            if ($iterations >= 100) {
                throw new \OverflowException(sprintf('Maximum retries of %d reached without finding a valid real text', $iterations));
            }

            $result = $this->generateText($maxNbChars, $words);
        } while (static::strlen($result) <= $minNbChars);

        return $result;
    }

    /**
     * Build one candidate text by walking the word table.
     *
     * Starts at a random table key and keeps appending a random follower of the
     * current key until the accumulated length reaches $maxNbChars or the current
     * key has no followers. Words are skipped until one satisfies validStart().
     *
     * @param int   $maxNbChars Length limit for the accumulated words (each word counts its
     *                          length plus the separator length)
     * @param array $words      Word table as returned by getConsecutiveWords()
     *
     * @return string
     */
    protected function generateText($maxNbChars, $words)
    {
        $result = [];
        $resultLength = 0;
        // take a random starting point
        $next = static::randomKey($words);

        while ($resultLength < $maxNbChars && isset($words[$next])) {
            // fetch a random word to append
            $word = static::randomElement($words[$next]);

            // calculate next index: slide the window forward by one word
            $currentWords = static::explode($next);
            $currentWords[] = $word;
            array_shift($currentWords);
            $next = static::implode($currentWords);

            // ensure text starts with an uppercase letter
            if ($resultLength == 0 && !static::validStart($word)) {
                continue;
            }

            // append the element
            $result[] = $word;
            $resultLength += static::strlen($word) + static::$separatorLen;
        }

        // Remove the element that caused the text to overflow. When the loop ended
        // only because the chain had no follower for the current key, nothing
        // overflowed and every collected word is kept.
        if ($resultLength >= $maxNbChars) {
            array_pop($result);
        }

        // build result
        $result = static::implode($result);

        return static::appendEnd($result);
    }

    /**
     * Return the word table for the given index size, building and caching it on first use.
     *
     * The table maps each run of $indexSize consecutive corpus words (joined with the
     * separator) to the list of words that follow that run in the corpus. A follower is
     * listed once per occurrence, which is what weights the random choice.
     *
     * @param int $indexSize Number of consecutive words forming a table key
     *
     * @return array
     */
    protected function getConsecutiveWords($indexSize)
    {
        if (!isset($this->consecutiveWords[$indexSize])) {
            $parts = $this->getExplodedText();
            $words = [];
            $index = [];

            // Seed the sliding window with the first $indexSize words of the corpus.
            for ($i = 0; $i < $indexSize; ++$i) {
                $index[] = array_shift($parts);
            }

            foreach ($parts as $word) {
                $stringIndex = static::implode($index);

                if (!isset($words[$stringIndex])) {
                    $words[$stringIndex] = [];
                }

                $words[$stringIndex][] = $word;

                // Slide the window: drop the oldest word, add the current one.
                array_shift($index);
                $index[] = $word;
            }

            // cache look up words for performance
            $this->consecutiveWords[$indexSize] = $words;
        }

        return $this->consecutiveWords[$indexSize];
    }

    /**
     * Return the corpus split into words, computing and caching it on first call.
     *
     * Every run of whitespace in the corpus is collapsed to a single space before splitting.
     *
     * @return array
     */
    protected function getExplodedText()
    {
        if ($this->explodedText === null) {
            $this->explodedText = static::explode(preg_replace('/\s+/u', ' ', static::$baseText));
        }

        return $this->explodedText;
    }

    /**
     * Split a text into words on the configured separator.
     *
     * @param string $text
     *
     * @return array
     */
    protected static function explode($text)
    {
        return explode(static::$separator, $text);
    }

    /**
     * Join words into a text using the configured separator.
     *
     * @param array $words
     *
     * @return string
     */
    protected static function implode($words)
    {
        return implode(static::$separator, $words);
    }

    /**
     * Return the length of a text: UTF-8 characters when mbstring is available, bytes otherwise.
     *
     * @param string $text
     *
     * @return int
     */
    protected static function strlen($text)
    {
        return function_exists('mb_strlen') ? mb_strlen($text, 'UTF-8') : strlen($text);
    }

    /**
     * Tell whether a word may be used as the first word of a generated text.
     *
     * Any word is accepted unless $textStartsWithUppercase is set, in which case the
     * word must begin with a Unicode uppercase letter.
     *
     * @param string $word
     *
     * @return bool
     */
    protected static function validStart($word)
    {
        if (!static::$textStartsWithUppercase) {
            return true;
        }

        // preg_match() yields 1, 0 or false; normalise to a real boolean.
        return preg_match('/^\p{Lu}/u', $word) === 1;
    }

    /**
     * Strip trailing punctuation from a text and terminate it with a full stop.
     *
     * Note that `,-:` inside the character class is a range (U+002C to U+003A), so
     * besides comma, hyphen and colon it also matches `.`, `/` and the digits 0-9.
     * A trailing run of any of these, of spaces, semicolons, en dashes or em dashes
     * is removed before the '.' is appended.
     *
     * @param string $text
     *
     * @return string
     */
    protected static function appendEnd($text)
    {
        return preg_replace("/([ ,-:;\x{2013}\x{2014}]+$)/us", '', $text) . '.';
    }
}