Skip to content

Commit

Permalink
Merge pull request #16 from nojimage/develop
Browse files Browse the repository at this point in the history
Refactor Regex class
  • Loading branch information
nojimage authored Dec 28, 2017
2 parents 4c3d764 + c763596 commit a2a1fb1
Show file tree
Hide file tree
Showing 10 changed files with 2,694 additions and 284 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ php:
- 5.6
- 7.0
- 7.1
- 7.2

dist: trusty
sudo: false
Expand Down
18 changes: 13 additions & 5 deletions lib/Twitter/Text/Autolink.php
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
*/
class Autolink extends Regex
class Autolink
{

/**
Expand Down Expand Up @@ -139,6 +139,13 @@ class Autolink extends Regex
*/
protected $extractor = null;

/**
* The tweet to be used in parsing.
*
* @var string
*/
protected $tweet = '';

/**
* Provides fluent method chaining.
*
Expand Down Expand Up @@ -170,13 +177,14 @@ public function __construct($tweet = null, $escape = true, $full_encode = false)
{
if ($escape && !empty($tweet)) {
if ($full_encode) {
parent::__construct(htmlentities($tweet, ENT_QUOTES, 'UTF-8', false));
$this->tweet = htmlentities($tweet, ENT_QUOTES, 'UTF-8', false);
} else {
parent::__construct(htmlspecialchars($tweet, ENT_QUOTES, 'UTF-8', false));
$this->tweet = htmlspecialchars($tweet, ENT_QUOTES, 'UTF-8', false);
}
} else {
parent::__construct($tweet);
$this->tweet = $tweet;
}

$this->extractor = Extractor::create();
}

Expand Down Expand Up @@ -650,7 +658,7 @@ public function linkToHashtag($entity, $tweet = null)
if (!empty($this->class_hash)) {
$class[] = $this->class_hash;
}
if (preg_match(self::$patterns['rtl_chars'], $linkText)) {
if (preg_match(Regex::getRtlCharsMatcher(), $linkText)) {
$class[] = 'rtl';
}
if (!empty($class)) {
Expand Down
39 changes: 23 additions & 16 deletions lib/Twitter/Text/Extractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,21 @@
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
*/
class Extractor extends Regex
class Extractor
{

/**
* @var boolean
*/
protected $extractURLWithoutProtocol = true;

/**
* The tweet to be used in parsing.
*
* @var string
*/
protected $tweet = '';

/**
* Provides fluent method chaining.
*
Expand All @@ -60,7 +67,7 @@ public static function create($tweet = null)
*/
public function __construct($tweet = null)
{
parent::__construct($tweet);
$this->tweet = $tweet;
}

/**
Expand Down Expand Up @@ -205,9 +212,9 @@ public function extractReplyScreenname($tweet = null)
if (is_null($tweet)) {
$tweet = $this->tweet;
}
$matched = preg_match(self::$patterns['valid_reply'], $tweet, $matches);
$matched = preg_match(Regex::getValidReplyMatcher(), $tweet, $matches);
# Discard the match when the text following the username matches the end-mention pattern
if ($matched && preg_match(self::$patterns['end_mention_match'], $matches[2])) {
if ($matched && preg_match(Regex::getEndMentionMatcher(), $matches[2])) {
$matched = false;
}
return $matched ? $matches[1] : null;
Expand Down Expand Up @@ -243,15 +250,15 @@ public function extractHashtagsWithIndices($tweet = null, $checkUrlOverlap = tru
return array();
}

preg_match_all(self::$patterns['valid_hashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
preg_match_all(Regex::getValidHashtagMatcher(), $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
$tags = array();

foreach ($matches as $match) {
list($all, $before, $hash, $hashtag, $outer) = array_pad($match, 3, array('', 0));
$start_position = $hash[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $hash[1])) : $hash[1];
$end_position = $start_position + StringUtils::strlen($hash[0] . $hashtag[0]);

if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) {
if (preg_match(Regex::getEndHashtagMatcher(), $outer[0])) {
continue;
}

Expand Down Expand Up @@ -296,15 +303,15 @@ public function extractCashtagsWithIndices($tweet = null)
return array();
}

preg_match_all(self::$patterns['valid_cashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
preg_match_all(Regex::getValidCashtagMatcher(), $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
$tags = array();

foreach ($matches as $match) {
list($all, $before, $dollar, $cash_text, $outer) = array_pad($match, 3, array('', 0));
$start_position = $dollar[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $dollar[1])) : $dollar[1];
$end_position = $start_position + StringUtils::strlen($dollar[0] . $cash_text[0]);

if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) {
if (preg_match(Regex::getEndHashtagMatcher(), $outer[0])) {
continue;
}

Expand Down Expand Up @@ -335,7 +342,7 @@ public function extractURLsWithIndices($tweet = null)
}

$urls = array();
preg_match_all(self::$patterns['valid_url'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
preg_match_all(Regex::getValidUrlMatcher(), $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

foreach ($matches as $match) {
list($all, $before, $url, $protocol, $domain, $port, $path, $query) = array_pad($match, 8, array(''));
Expand All @@ -354,14 +361,14 @@ public function extractURLsWithIndices($tweet = null)
// If protocol is missing and domain contains non-ASCII characters,
// extract ASCII-only domains.
if (empty($protocol)) {
if (!$this->extractURLWithoutProtocol || preg_match(self::$patterns['invalid_url_without_protocol_preceding_chars'], $before)) {
if (!$this->extractURLWithoutProtocol || preg_match(Regex::getInvalidUrlWithoutProtocolPrecedingCharsMatcher(), $before)) {
continue;
}

$last_url = null;
$ascii_end_position = 0;

if (preg_match(self::$patterns['valid_ascii_domain'], $domain, $asciiDomain)) {
if (preg_match(Regex::getValidAsciiDomainMatcher(), $domain, $asciiDomain)) {
$asciiDomain[0] = preg_replace('/' . preg_quote($domain, '/') . '/u', $asciiDomain[0], $url);
$ascii_start_position = StringUtils::strpos($domain, $asciiDomain[0], $ascii_end_position);
$ascii_end_position = $ascii_start_position + StringUtils::strlen($asciiDomain[0]);
Expand All @@ -370,8 +377,8 @@ public function extractURLsWithIndices($tweet = null)
'indices' => array($start_position + $ascii_start_position, $start_position + $ascii_end_position),
);
if (!empty($path)
|| preg_match(self::$patterns['valid_special_short_domain'], $asciiDomain[0])
|| !preg_match(self::$patterns['invalid_short_domain'], $asciiDomain[0])) {
|| preg_match(Regex::getValidSpecialShortDomainMatcher(), $asciiDomain[0])
|| !preg_match(Regex::getInvalidCharactersMatcher(), $asciiDomain[0])) {
$urls[] = $last_url;
}
}
Expand All @@ -389,7 +396,7 @@ public function extractURLsWithIndices($tweet = null)
}
} else {
// In the case of t.co URLs, don't allow additional path characters
if (preg_match(self::$patterns['valid_tco_url'], $url, $tcoUrlMatches)) {
if (preg_match(Regex::getValidTcoUrlMatcher(), $url, $tcoUrlMatches)) {
$url = $tcoUrlMatches[0];
$end_position = $start_position + StringUtils::strlen($url);
}
Expand Down Expand Up @@ -453,7 +460,7 @@ public function extractMentionsOrListsWithIndices($tweet = null)
return array();
}

preg_match_all(self::$patterns['valid_mentions_or_lists'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
preg_match_all(Regex::getValidMentionsOrListsMatcher(), $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
$results = array();

foreach ($matches as $match) {
Expand All @@ -466,7 +473,7 @@ public function extractMentionsOrListsWithIndices($tweet = null)
'indices' => array($start_position, $end_position),
);

if (preg_match(self::$patterns['end_mention_match'], $outer[0])) {
if (preg_match(Regex::getEndMentionMatcher(), $outer[0])) {
continue;
}

Expand Down
15 changes: 11 additions & 4 deletions lib/Twitter/Text/HitHighlighter.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
*/
class HitHighlighter extends Regex
class HitHighlighter
{

/**
Expand All @@ -37,6 +37,13 @@ class HitHighlighter extends Regex
*/
protected $tag = 'em';

/**
* The tweet to be used in parsing.
*
* @var string
*/
protected $tweet = '';

/**
* Provides fluent method chaining.
*
Expand Down Expand Up @@ -67,12 +74,12 @@ public function __construct($tweet = null, $escape = true, $full_encode = false)
{
if (!empty($tweet) && $escape) {
if ($full_encode) {
parent::__construct(htmlentities($tweet, ENT_QUOTES, 'UTF-8', false));
$this->tweet = htmlentities($tweet, ENT_QUOTES, 'UTF-8', false);
} else {
parent::__construct(htmlspecialchars($tweet, ENT_QUOTES, 'UTF-8', false));
$this->tweet = htmlspecialchars($tweet, ENT_QUOTES, 'UTF-8', false);
}
} else {
parent::__construct($tweet);
$this->tweet = $tweet;
}
}

Expand Down
16 changes: 8 additions & 8 deletions lib/Twitter/Text/LooseAutolink.php
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public function addLinks()
public function addLinksToHashtags()
{
return preg_replace_callback(
self::$patterns['valid_hashtag'],
Regex::getValidHashtagMatcher(),
array($this, '_addLinksToHashtags'),
$this->tweet
);
Expand All @@ -148,7 +148,7 @@ public function addLinksToHashtags()
public function addLinksToCashtags()
{
return preg_replace_callback(
self::$patterns['valid_cashtag'],
Regex::getValidCashtagMatcher(),
array($this, '_addLinksToCashtags'),
$this->tweet
);
Expand All @@ -161,7 +161,7 @@ public function addLinksToCashtags()
*/
public function addLinksToURLs()
{
return preg_replace_callback(self::$patterns['valid_url'], array($this, '_addLinksToURLs'), $this->tweet);
return preg_replace_callback(Regex::getValidUrlMatcher(), array($this, '_addLinksToURLs'), $this->tweet);
}

/**
Expand All @@ -172,7 +172,7 @@ public function addLinksToURLs()
public function addLinksToUsernamesAndLists()
{
return preg_replace_callback(
self::$patterns['valid_mentions_or_lists'],
Regex::getValidMentionsOrListsMatcher(),
array($this, '_addLinksToUsernamesAndLists'),
$this->tweet
);
Expand Down Expand Up @@ -261,15 +261,15 @@ protected function wrapHash($url, $class, $element)
protected function _addLinksToHashtags($matches)
{
list($all, $before, $hash, $tag, $after) = array_pad($matches, 5, '');
if (preg_match(self::$patterns['end_hashtag_match'], $after)
if (preg_match(Regex::getEndHashtagMatcher(), $after)
|| (!preg_match('!\A["\']!', $before) && preg_match('!\A["\']!', $after)) || preg_match('!\A</!', $after)) {
return $all;
}
$replacement = $before;
$element = $hash . $tag;
$url = $this->url_base_hash . $tag;
$class_hash = $this->class_hash;
if (preg_match(self::$patterns['rtl_chars'], $element)) {
if (preg_match(Regex::getRtlCharsMatcher(), $element)) {
$class_hash .= ' rtl';
}
$replacement .= $this->wrapHash($url, $class_hash, $element);
Expand All @@ -286,7 +286,7 @@ protected function _addLinksToHashtags($matches)
protected function _addLinksToCashtags($matches)
{
list($all, $before, $cash, $tag, $after) = array_pad($matches, 5, '');
if (preg_match(self::$patterns['end_cashtag_match'], $after)
if (preg_match(Regex::getEndCashtagMatcher(), $after)
|| (!preg_match('!\A["\']!', $before) && preg_match('!\A["\']!', $after)) || preg_match('!\A</!', $after)) {
return $all;
}
Expand Down Expand Up @@ -331,7 +331,7 @@ protected function _addLinksToUsernamesAndLists($matches)
$class = $this->class_list;
$url = $this->url_base_list . $element;
} else {
if (preg_match(self::$patterns['end_mention_match'], $after)) {
if (preg_match(Regex::getEndMentionMatcher(), $after)) {
return $all;
}
# Replace the username
Expand Down
Loading

0 comments on commit a2a1fb1

Please sign in to comment.