From c02e78f26d6a89db98f1c99d5b4f7ddb12e3248f Mon Sep 17 00:00:00 2001 From: nojimage Date: Mon, 18 Dec 2017 20:20:26 +0900 Subject: [PATCH 01/10] Move tld regex to TldLists class --- lib/Twitter/Text/Regex.php | 9 +- lib/Twitter/Text/TldLists.php | 1636 +++++++++++++++++++++++++++ tests/Twitter/Text/TldListsTest.php | 39 + 3 files changed, 1679 insertions(+), 5 deletions(-) create mode 100644 lib/Twitter/Text/TldLists.php create mode 100644 tests/Twitter/Text/TldListsTest.php diff --git a/lib/Twitter/Text/Regex.php b/lib/Twitter/Text/Regex.php index 5969f38..83c88ab 100644 --- a/lib/Twitter/Text/Regex.php +++ b/lib/Twitter/Text/Regex.php @@ -10,6 +10,8 @@ namespace Twitter\Text; +use Twitter\Text\TldLists; + /** * Twitter Regex Abstract Class * @@ -177,11 +179,8 @@ public static function __static() $tmp['valid_domain_name'] = '(?:(?:[' . $tmp['domain_valid_chars'] . '][' . $tmp['domain_valid_chars'] . '\-]*)?[' . $tmp['domain_valid_chars'] . ']\.)'; $tmp['domain_valid_unicode_chars'] = '[^\p{P}\p{Z}\p{C}' . $tmp['invalid_characters'] . $tmp['spaces'] . 
']'; - $gTLD = '삼성|닷컴|닷넷|香格里拉|餐厅|食品|飞利浦|電訊盈科|集团|通販|购物|谷歌|诺基亚|联通|网络|网站|网店|网址|组织机构|移动|珠宝|点看|游戏|淡马锡|机构|書籍|时尚|新闻|政府|政务|手表|手机|我爱你|慈善|微博|广东|工行|家電|娱乐|天主教|大拿|大众汽车|在线|嘉里大酒店|嘉里|商标|商店|商城|公益|公司|八卦|健康|信息|佛山|企业|中文网|中信|世界|ポイント|ファッション|セール|ストア|コム|グーグル|クラウド|みんな|คอม|संगठन|नेट|कॉम|همراه|موقع|موبايلي|كوم|كاثوليك|عرب|شبكة|بيتك|بازار|العليان|ارامكو|اتصالات|ابوظبي|קום|сайт|рус|орг|онлайн|москва|ком|католик|дети|zuerich|zone|zippo|zip|zero|zara|zappos|yun|youtube|you|yokohama|yoga|yodobashi|yandex|yamaxun|yahoo|yachts|xyz|xxx|xperia|xin|xihuan|xfinity|xerox|xbox|wtf|wtc|wow|world|works|work|woodside|wolterskluwer|wme|winners|wine|windows|win|williamhill|wiki|wien|whoswho|weir|weibo|wedding|wed|website|weber|webcam|weatherchannel|weather|watches|watch|warman|wanggou|wang|walter|walmart|wales|vuelos|voyage|voto|voting|vote|volvo|volkswagen|vodka|vlaanderen|vivo|viva|vistaprint|vista|vision|visa|virgin|vip|vin|villas|viking|vig|video|viajes|vet|versicherung|vermögensberatung|vermögensberater|verisign|ventures|vegas|vanguard|vana|vacations|ups|uol|uno|university|unicom|uconnect|ubs|ubank|tvs|tushu|tunes|tui|tube|trv|trust|travelersinsurance|travelers|travelchannel|travel|training|trading|trade|toys|toyota|town|tours|total|toshiba|toray|top|tools|tokyo|today|tmall|tkmaxx|tjx|tjmaxx|tirol|tires|tips|tiffany|tienda|tickets|tiaa|theatre|theater|thd|teva|tennis|temasek|telefonica|telecity|tel|technology|tech|team|tdk|tci|taxi|tax|tattoo|tatar|tatamotors|target|taobao|talk|taipei|tab|systems|symantec|sydney|swiss|swiftcover|swatch|suzuki|surgery|surf|support|supply|supplies|sucks|style|study|studio|stream|store|storage|stockholm|stcgroup|stc|statoil|statefarm|statebank|starhub|star|staples|stada|srt|srl|spreadbetting|spot|spiegel|space|soy|sony|song|solutions|solar|sohu|software|softbank|social|soccer|sncf|smile|smart|sling|skype|sky|skin|ski|site|singles|sina|silk|shriram|showtime|show|shouji|shopping|shop|shoes|shiksha|shia|shell|shaw|sharp|shangrila|sfr|sexy|sex|sew|seven|ses|services|sener|select
|seek|security|secure|seat|search|scot|scor|scjohnson|science|schwarz|schule|school|scholarships|schmidt|schaeffler|scb|sca|sbs|sbi|saxo|save|sas|sarl|sapo|sap|sanofi|sandvikcoromant|sandvik|samsung|samsclub|salon|sale|sakura|safety|safe|saarland|ryukyu|rwe|run|ruhr|rugby|rsvp|room|rogers|rodeo|rocks|rocher|rmit|rip|rio|ril|rightathome|ricoh|richardli|rich|rexroth|reviews|review|restaurant|rest|republican|report|repair|rentals|rent|ren|reliance|reit|reisen|reise|rehab|redumbrella|redstone|red|recipes|realty|realtor|realestate|read|raid|radio|racing|qvc|quest|quebec|qpon|pwc|pub|prudential|pru|protection|property|properties|promo|progressive|prof|productions|prod|pro|prime|press|praxi|pramerica|post|porn|politie|poker|pohl|pnc|plus|plumbing|playstation|play|place|pizza|pioneer|pink|ping|pin|pid|pictures|pictet|pics|piaget|physio|photos|photography|photo|phone|philips|phd|pharmacy|pfizer|pet|pccw|pay|passagens|party|parts|partners|pars|paris|panerai|panasonic|pamperedchef|page|ovh|ott|otsuka|osaka|origins|orientexpress|organic|org|orange|oracle|open|ooo|onyourside|online|onl|ong|one|omega|ollo|oldnavy|olayangroup|olayan|okinawa|office|off|observer|obi|nyc|ntt|nrw|nra|nowtv|nowruz|now|norton|northwesternmutual|nokia|nissay|nissan|ninja|nikon|nike|nico|nhk|ngo|nfl|nexus|nextdirect|next|news|newholland|new|neustar|network|netflix|netbank|net|nec|nba|navy|natura|nationwide|name|nagoya|nadex|nab|mutuelle|mutual|museum|mtr|mtpc|mtn|msd|movistar|movie|mov|motorcycles|moto|moscow|mortgage|mormon|mopar|montblanc|monster|money|monash|mom|moi|moe|moda|mobily|mobile|mobi|mma|mls|mlb|mitsubishi|mit|mint|mini|mil|microsoft|miami|metlife|merckmsd|meo|menu|men|memorial|meme|melbourne|meet|media|med|mckinsey|mcdonalds|mcd|mba|mattel|maserati|marshalls|marriott|markets|marketing|market|map|mango|management|man|makeup|maison|maif|madrid|macys|luxury|luxe|lupin|lundbeck|ltda|ltd|lplfinancial|lpl|love|lotto|lotte|london|lol|loft|locus|locker|loans|loan|lixil|living|live|lipsy|link|linde|l
incoln|limo|limited|lilly|like|lighting|lifestyle|lifeinsurance|life|lidl|liaison|lgbt|lexus|lego|legal|lefrak|leclerc|lease|lds|lawyer|law|latrobe|latino|lat|lasalle|lanxess|landrover|land|lancome|lancia|lancaster|lamer|lamborghini|ladbrokes|lacaixa|kyoto|kuokgroup|kred|krd|kpn|kpmg|kosher|komatsu|koeln|kiwi|kitchen|kindle|kinder|kim|kia|kfh|kerryproperties|kerrylogistics|kerryhotels|kddi|kaufen|juniper|juegos|jprs|jpmorgan|joy|jot|joburg|jobs|jnj|jmp|jll|jlc|jio|jewelry|jetzt|jeep|jcp|jcb|java|jaguar|iwc|iveco|itv|itau|istanbul|ist|ismaili|iselect|irish|ipiranga|investments|intuit|international|intel|int|insure|insurance|institute|ink|ing|info|infiniti|industries|immobilien|immo|imdb|imamat|ikano|iinet|ifm|ieee|icu|ice|icbc|ibm|hyundai|hyatt|hughes|htc|hsbc|how|house|hotmail|hotels|hoteles|hot|hosting|host|hospital|horse|honeywell|honda|homesense|homes|homegoods|homedepot|holiday|holdings|hockey|hkt|hiv|hitachi|hisamitsu|hiphop|hgtv|hermes|here|helsinki|help|healthcare|health|hdfcbank|hdfc|hbo|haus|hangout|hamburg|hair|guru|guitars|guide|guge|gucci|guardian|group|grocery|gripe|green|gratis|graphics|grainger|gov|got|gop|google|goog|goodyear|goodhands|goo|golf|goldpoint|gold|godaddy|gmx|gmo|gmbh|gmail|globo|global|gle|glass|glade|giving|gives|gifts|gift|ggee|george|genting|gent|gea|gdn|gbiz|garden|gap|games|game|gallup|gallo|gallery|gal|fyi|futbol|furniture|fund|fun|fujixerox|fujitsu|ftr|frontier|frontdoor|frogans|frl|fresenius|free|fox|foundation|forum|forsale|forex|ford|football|foodnetwork|food|foo|fly|flsmidth|flowers|florist|flir|flights|flickr|fitness|fit|fishing|fish|firmdale|firestone|fire|financial|finance|final|film|fido|fidelity|fiat|ferrero|ferrari|feedback|fedex|fast|fashion|farmers|farm|fans|fan|family|faith|fairwinds|fail|fage|extraspace|express|exposed|expert|exchange|everbank|events|eus|eurovision|etisalat|esurance|estate|esq|erni|ericsson|equipment|epson|epost|enterprises|engineering|engineer|energy|emerck|email|education|edu|edeka|eco|eat|earth|dv
r|dvag|durban|dupont|duns|dunlop|duck|dubai|dtv|drive|download|dot|doosan|domains|doha|dog|dodge|doctor|docs|dnp|diy|dish|discover|discount|directory|direct|digital|diet|diamonds|dhl|dev|design|desi|dentist|dental|democrat|delta|deloitte|dell|delivery|degree|deals|dealer|deal|dds|dclk|day|datsun|dating|date|data|dance|dad|dabur|cyou|cymru|cuisinella|csc|cruises|cruise|crs|crown|cricket|creditunion|creditcard|credit|courses|coupons|coupon|country|corsica|coop|cool|cookingchannel|cooking|contractors|contact|consulting|construction|condos|comsec|computer|compare|company|community|commbank|comcast|com|cologne|college|coffee|codes|coach|clubmed|club|cloud|clothing|clinique|clinic|click|cleaning|claims|cityeats|city|citic|citi|citadel|cisco|circle|cipriani|church|chrysler|chrome|christmas|chloe|chintai|cheap|chat|chase|channel|chanel|cfd|cfa|cern|ceo|center|ceb|cbs|cbre|cbn|cba|catholic|catering|cat|casino|cash|caseih|case|casa|cartier|cars|careers|career|care|cards|caravan|car|capitalone|capital|capetown|canon|cancerresearch|camp|camera|cam|calvinklein|call|cal|cafe|cab|bzh|buzz|buy|business|builders|build|bugatti|budapest|brussels|brother|broker|broadway|bridgestone|bradesco|box|boutique|bot|boston|bostik|bosch|boots|booking|book|boo|bond|bom|bofa|boehringer|boats|bnpparibas|bnl|bmw|bms|blue|bloomberg|blog|blockbuster|blanco|blackfriday|black|biz|bio|bingo|bing|bike|bid|bible|bharti|bet|bestbuy|best|berlin|bentley|beer|beauty|beats|bcn|bcg|bbva|bbt|bbc|bayern|bauhaus|basketball|baseball|bargains|barefoot|barclays|barclaycard|barcelona|bar|bank|band|bananarepublic|banamex|baidu|baby|azure|axa|aws|avianca|autos|auto|author|auspost|audio|audible|audi|auction|attorney|athleta|associates|asia|asda|arte|art|arpa|army|archi|aramco|arab|aquarelle|apple|app|apartments|aol|anz|anquan|android|analytics|amsterdam|amica|amfam|amex|americanfamily|americanexpress|alstom|alsace|ally|allstate|allfinanz|alipay|alibaba|alfaromeo|akdn|airtel|airforce|airbus|aigo|aig|agency|agakhan|africa|a
fl|afamilycompany|aetna|aero|aeg|adult|ads|adac|actor|active|aco|accountants|accountant|accenture|academy|abudhabi|abogado|able|abc|abbvie|abbott|abb|abarth|aarp|aaa|onion'; - $ccTLD = '한국|香港|澳門|新加坡|台灣|台湾|中國|中国|გე|ไทย|ලංකා|ഭാരതം|ಭಾರತ|భారత్|சிங்கப்பூர்|இலங்கை|இந்தியா|ଭାରତ|ભારત|ਭਾਰਤ|ভাৰত|ভারত|বাংলা|भारोत|भारतम्|भारत|ڀارت|پاکستان|مليسيا|مصر|قطر|فلسطين|عمان|عراق|سورية|سودان|تونس|بھارت|بارت|ایران|امارات|المغرب|السعودية|الجزائر|الاردن|հայ|қаз|укр|срб|рф|мон|мкд|ею|бел|бг|ελ|zw|zm|za|yt|ye|ws|wf|vu|vn|vi|vg|ve|vc|va|uz|uy|us|um|uk|ug|ua|tz|tw|tv|tt|tr|tp|to|tn|tm|tl|tk|tj|th|tg|tf|td|tc|sz|sy|sx|sv|su|st|ss|sr|so|sn|sm|sl|sk|sj|si|sh|sg|se|sd|sc|sb|sa|rw|ru|rs|ro|re|qa|py|pw|pt|ps|pr|pn|pm|pl|pk|ph|pg|pf|pe|pa|om|nz|nu|nr|np|no|nl|ni|ng|nf|ne|nc|na|mz|my|mx|mw|mv|mu|mt|ms|mr|mq|mp|mo|mn|mm|ml|mk|mh|mg|mf|me|md|mc|ma|ly|lv|lu|lt|ls|lr|lk|li|lc|lb|la|kz|ky|kw|kr|kp|kn|km|ki|kh|kg|ke|jp|jo|jm|je|it|is|ir|iq|io|in|im|il|ie|id|hu|ht|hr|hn|hm|hk|gy|gw|gu|gt|gs|gr|gq|gp|gn|gm|gl|gi|gh|gg|gf|ge|gd|gb|ga|fr|fo|fm|fk|fj|fi|eu|et|es|er|eh|eg|ee|ec|dz|do|dm|dk|dj|de|cz|cy|cx|cw|cv|cu|cr|co|cn|cm|cl|ck|ci|ch|cg|cf|cd|cc|ca|bz|by|bw|bv|bt|bs|br|bq|bo|bn|bm|bl|bj|bi|bh|bg|bf|be|bd|bb|ba|az|ax|aw|au|at|as|ar|aq|ao|an|am|al|ai|ag|af|ae|ad|ac'; - - $tmp['valid_gTLD'] = '(?:(?:' . $gTLD . ')(?=[^0-9a-z@]|$))'; - $tmp['valid_ccTLD'] = '(?:(?:' . $ccTLD . ')(?=[^0-9a-z@]|$))'; + $tmp['valid_gTLD'] = TldLists::getValidGTLD(); + $tmp['valid_ccTLD'] = TldLists::getValidCcTLD(); $tmp['valid_special_ccTLD'] = '(?:(?:' . 'co|tv' . 
')(?=[^0-9a-z@]|$))'; $tmp['valid_punycode'] = '(?:xn--[0-9a-z]+)'; diff --git a/lib/Twitter/Text/TldLists.php b/lib/Twitter/Text/TldLists.php new file mode 100644 index 0000000..ccd9680 --- /dev/null +++ b/lib/Twitter/Text/TldLists.php @@ -0,0 +1,1636 @@ +assertStringStartsWith('(?:(?:삼성|닷컴|', $regexp); + $this->assertStringEndsWith('|aaa|onion)(?=[^0-9a-z@]|$))', $regexp); + + $regexpCached = TldLists::getValidGTLD(); + $this->assertSame($regexp, $regexpCached); + } + + /** + * @covers Twitter\Text\TldLists::getValidCcTLD + */ + public function testGetValidCcTLD() + { + $regexp = TldLists::getValidCcTLD(); + $this->assertStringStartsWith('(?:(?:한국|香港|', $regexp); + $this->assertStringEndsWith('|ad|ac)(?=[^0-9a-z@]|$))', $regexp); + + $regexpCached = TldLists::getValidCcTLD(); + $this->assertSame($regexp, $regexpCached); + } +} From 4d4a9759e8a6bb4738c5c337a3ca0468ecfc2c4f Mon Sep 17 00:00:00 2001 From: nojimage Date: Mon, 18 Dec 2017 20:42:05 +0900 Subject: [PATCH 02/10] replace rtl_chars matcher to static method --- lib/Twitter/Text/Autolink.php | 2 +- lib/Twitter/Text/LooseAutolink.php | 2 +- lib/Twitter/Text/Regex.php | 46 +++++++++++++++++++++--------- tests/Twitter/Text/RegexTest.php | 25 ++++++++++++++++ 4 files changed, 59 insertions(+), 16 deletions(-) create mode 100644 tests/Twitter/Text/RegexTest.php diff --git a/lib/Twitter/Text/Autolink.php b/lib/Twitter/Text/Autolink.php index bff6e14..b25fe49 100644 --- a/lib/Twitter/Text/Autolink.php +++ b/lib/Twitter/Text/Autolink.php @@ -650,7 +650,7 @@ public function linkToHashtag($entity, $tweet = null) if (!empty($this->class_hash)) { $class[] = $this->class_hash; } - if (preg_match(self::$patterns['rtl_chars'], $linkText)) { + if (preg_match(Regex::getRtlCharsMatcher(), $linkText)) { $class[] = 'rtl'; } if (!empty($class)) { diff --git a/lib/Twitter/Text/LooseAutolink.php b/lib/Twitter/Text/LooseAutolink.php index cc4905c..15cc2bd 100644 --- a/lib/Twitter/Text/LooseAutolink.php +++ 
b/lib/Twitter/Text/LooseAutolink.php @@ -269,7 +269,7 @@ protected function _addLinksToHashtags($matches) $element = $hash . $tag; $url = $this->url_base_hash . $tag; $class_hash = $this->class_hash; - if (preg_match(self::$patterns['rtl_chars'], $element)) { + if (preg_match(Regex::getRtlCharsMatcher(), $element)) { $class_hash .= ' rtl'; } $replacement .= $this->wrapHash($url, $class_hash, $element); diff --git a/lib/Twitter/Text/Regex.php b/lib/Twitter/Text/Regex.php index 83c88ab..c03ea17 100644 --- a/lib/Twitter/Text/Regex.php +++ b/lib/Twitter/Text/Regex.php @@ -27,7 +27,7 @@ * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0 * @package Twitter */ -abstract class Regex +class Regex { /** @@ -45,6 +45,20 @@ abstract class Regex */ protected $tweet = ''; + /** + * Expression to match RTL characters. + * + * 0x0600-0x06FF Arabic + * 0x0750-0x077F Arabic Supplement + * 0x08A0-0x08FF Arabic Extended-A + * 0x0590-0x05FF Hebrew + * 0xFB50-0xFDFF Arabic Presentation Forms-A + * 0xFE70-0xFEFF Arabic Presentation Forms-B + * + * @var string + */ + private static $rtlChars = '\x{0600}-\x{06ff}\x{0750}-\x{077f}\x{08a0}-\x{08ff}\x{0590}-\x{05ff}\x{fb50}-\x{fdff}\x{fe70}-\x{feff}'; + /** * This constructor is used to populate some variables. * @@ -122,16 +136,6 @@ public static function __static() $tmp['latin_accents'] .= '\x{0100}-\x{024f}\x{0253}-\x{0254}\x{0256}-\x{0257}'; $tmp['latin_accents'] .= '\x{0259}\x{025b}\x{0263}\x{0268}\x{026f}\x{0272}\x{0289}\x{028b}\x{02bb}\x{0300}-\x{036f}\x{1e00}-\x{1eff}'; - # Expression to match RTL characters. 
- # - # 0x0600-0x06FF Arabic - # 0x0750-0x077F Arabic Supplement - # 0x08A0-0x08FF Arabic Extended-A - # 0x0590-0x05FF Hebrew - # 0xFB50-0xFDFF Arabic Presentation Forms-A - # 0xFE70-0xFEFF Arabic Presentation Forms-B - $tmp['rtl_chars'] = '\x{0600}-\x{06ff}\x{0750}-\x{077f}\x{08a0}-\x{08ff}\x{0590}-\x{05ff}\x{fb50}-\x{fdff}\x{fe70}-\x{feff}'; - $tmp['hashtag_letters'] = '\p{L}\p{M}'; $tmp['hashtag_numerals'] = '\p{Nd}'; # Hashtag special chars @@ -329,12 +333,26 @@ public static function __static() $re['invalid_characters'] = '/[' . $tmp['invalid_characters'] . ']/u'; - $re['rtl_chars'] = '/[' . $tmp['rtl_chars'] . ']/iu'; - # Flag that initialization is complete: $initialized = true; } -} + /** + * Regexp to match RTL characters. + * + * @staticvar string $regexp + * @return string + */ + public static function getRtlCharsMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/[' . static::$rtlChars . ']/iu'; + } + + return $regexp; + } +} # Cause regular expressions to be initialized as soon as this file is loaded: Regex::__static(); diff --git a/tests/Twitter/Text/RegexTest.php b/tests/Twitter/Text/RegexTest.php new file mode 100644 index 0000000..afc9e58 --- /dev/null +++ b/tests/Twitter/Text/RegexTest.php @@ -0,0 +1,25 @@ +assertStringStartsWith('/[', $matcher); + $this->assertStringEndsWith(']/iu', $matcher); + + $matcherCached = Regex::getRtlCharsMatcher(); + $this->assertSame($matcher, $matcherCached); + } +} From 86387a5e95acce291eeedde7c15cd4cae449c5a5 Mon Sep 17 00:00:00 2001 From: nojimage Date: Mon, 18 Dec 2017 20:51:17 +0900 Subject: [PATCH 03/10] replase invalid_characters matcher to static method --- lib/Twitter/Text/Regex.php | 42 +++++++++++++++++++++++--------- lib/Twitter/Text/Validator.php | 2 +- tests/Twitter/Text/RegexTest.php | 13 ++++++++++ 3 files changed, 44 insertions(+), 13 deletions(-) diff --git a/lib/Twitter/Text/Regex.php b/lib/Twitter/Text/Regex.php index c03ea17..03d30ad 100644 --- 
a/lib/Twitter/Text/Regex.php +++ b/lib/Twitter/Text/Regex.php @@ -45,6 +45,15 @@ class Regex */ protected $tweet = ''; + /** + * Invalid Characters + * + * 0xFFFE,0xFEFF # BOM + * 0xFFFF # Special + * 0x202A-0x202E # Directional change + */ + private static $invalidCharacters = '\x{202a}-\x{202e}\x{feff}\x{fffe}\x{ffff}'; + /** * Expression to match RTL characters. * @@ -100,12 +109,6 @@ public static function __static() # 0x3000 Zs # IDEOGRAPHIC SPACE $tmp['spaces'] = '\x{0009}-\x{000D}\x{0020}\x{0085}\x{00a0}\x{1680}\x{180E}\x{2000}-\x{200a}\x{2028}\x{2029}\x{202f}\x{205f}\x{3000}'; - # Invalid Characters: - # 0xFFFE,0xFEFF # BOM - # 0xFFFF # Special - # 0x202A-0x202E # Directional change - $tmp['invalid_characters'] = '\x{202a}-\x{202e}\x{feff}\x{fffe}\x{ffff}'; - # Expression to match at and hash sign characters: $tmp['at_signs'] = '@@'; $tmp['hash_signs'] = '##'; @@ -158,7 +161,7 @@ public static function __static() # 0x0f0c TIBETAN MARK DELIMITER TSHEG BSTAR # 0x00b7 MIDDLE DOT $tmp['hashtag_special_chars'] = '_\x{200c}\x{200d}\x{a67e}\x{05be}\x{05f3}\x{05f4}\x{ff5e}\x{301c}\x{309b}\x{309c}\x{30a0}\x{30fb}\x{3003}\x{0f0b}\x{0f0c}\x{00b7}'; - $tmp['hashtag_letters_numerals_set'] = '[' . $tmp['hashtag_letters'] . $tmp['hashtag_numerals'] . $tmp['hashtag_special_chars'] . ']'; + $tmp['hashtag_letters_numerals_set'] = '[' . $tmp['hashtag_letters'] . $tmp['hashtag_numerals'] . $tmp['hashtag_special_chars'] . ']'; $tmp['hashtag_letters_set'] = '[' . $tmp['hashtag_letters'] . ']'; $tmp['hashtag_boundary'] = '(?:\A|\x{fe0e}|\x{fe0f}|[^&' . $tmp['hashtag_letters'] . $tmp['hashtag_numerals'] . $tmp['hashtag_special_chars'] . '])'; $tmp['hashtag'] = '(' . $tmp['hashtag_boundary'] . ')(#|\x{ff03})(?!\x{fe0f}|\x{20e3})(' . $tmp['hashtag_letters_numerals_set'] . '*' . $tmp['hashtag_letters_set'] . $tmp['hashtag_letters_numerals_set'] . 
'*)'; @@ -176,12 +179,12 @@ public static function __static() # URL related hash regex collection - $tmp['valid_url_preceding_chars'] = '(?:[^A-Z0-9_@@\$##' . $tmp['invalid_characters'] . ']|^)'; + $tmp['valid_url_preceding_chars'] = '(?:[^A-Z0-9_@@\$##' . static::$invalidCharacters . ']|^)'; $tmp['domain_valid_chars'] = '0-9a-z' . $tmp['latin_accents']; $tmp['valid_subdomain'] = '(?>(?:[' . $tmp['domain_valid_chars'] . '][' . $tmp['domain_valid_chars'] . '\-_]*)?[' . $tmp['domain_valid_chars'] . ']\.)'; $tmp['valid_domain_name'] = '(?:(?:[' . $tmp['domain_valid_chars'] . '][' . $tmp['domain_valid_chars'] . '\-]*)?[' . $tmp['domain_valid_chars'] . ']\.)'; - $tmp['domain_valid_unicode_chars'] = '[^\p{P}\p{Z}\p{C}' . $tmp['invalid_characters'] . $tmp['spaces'] . ']'; + $tmp['domain_valid_unicode_chars'] = '[^\p{P}\p{Z}\p{C}' . static::$invalidCharacters . $tmp['spaces'] . ']'; $tmp['valid_gTLD'] = TldLists::getValidGTLD(); $tmp['valid_ccTLD'] = TldLists::getValidCcTLD(); @@ -331,14 +334,29 @@ public static function __static() . '\#(.*)' # $5 Fragment . ')?$/iux'; - $re['invalid_characters'] = '/[' . $tmp['invalid_characters'] . ']/u'; - # Flag that initialization is complete: $initialized = true; } /** - * Regexp to match RTL characters. + * Get invalid characters matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getInvalidCharactersMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/[' . static::$invalidCharacters . 
']/u'; + } + + return $regexp; + } + + /** + * Get RTL characters matcher * * @staticvar string $regexp * @return string diff --git a/lib/Twitter/Text/Validator.php b/lib/Twitter/Text/Validator.php index 650b206..429a887 100644 --- a/lib/Twitter/Text/Validator.php +++ b/lib/Twitter/Text/Validator.php @@ -178,7 +178,7 @@ public function isValidTweetText($tweet = null) if ($length > self::MAX_LENGTH) { return false; } - if (preg_match(self::$patterns['invalid_characters'], $tweet)) { + if (preg_match(Regex::getInvalidCharactersMatcher(), $tweet)) { return false; } return true; diff --git a/tests/Twitter/Text/RegexTest.php b/tests/Twitter/Text/RegexTest.php index afc9e58..201ea88 100644 --- a/tests/Twitter/Text/RegexTest.php +++ b/tests/Twitter/Text/RegexTest.php @@ -10,6 +10,19 @@ class RegexTest extends \PHPUnit_Framework_TestCase { + /** + * @covers Twitter\Text\Regex::getInvalidCharactersMatcher + */ + public function testGetInvalidCharactersMatcher() + { + $matcher = Regex::getInvalidCharactersMatcher(); + $this->assertStringStartsWith('/[', $matcher); + $this->assertStringEndsWith(']/u', $matcher); + + $matcherCached = Regex::getInvalidCharactersMatcher(); + $this->assertSame($matcher, $matcherCached); + } + /** * @covers Twitter\Text\Regex::getRtlCharsMatcher */ From e547a2f100dc9c750bc2680ed57a0aa43f68b186 Mon Sep 17 00:00:00 2001 From: nojimage Date: Mon, 18 Dec 2017 21:00:31 +0900 Subject: [PATCH 04/10] replace validate_url matchers to static method --- lib/Twitter/Text/Regex.php | 298 ++++++++++++++++++++++++------- lib/Twitter/Text/Validator.php | 21 ++- tests/Twitter/Text/RegexTest.php | 90 ++++++++++ 3 files changed, 341 insertions(+), 68 deletions(-) diff --git a/lib/Twitter/Text/Regex.php b/lib/Twitter/Text/Regex.php index 03d30ad..ffb9aef 100644 --- a/lib/Twitter/Text/Regex.php +++ b/lib/Twitter/Text/Regex.php @@ -68,6 +68,14 @@ class Regex */ private static $rtlChars = 
'\x{0600}-\x{06ff}\x{0750}-\x{077f}\x{08a0}-\x{08ff}\x{0590}-\x{05ff}\x{fb50}-\x{fdff}\x{fe70}-\x{feff}'; + # These URL validation pattern strings are based on the ABNF from RFC 3986 + private static $validateUrlUnreserved = '[a-z\p{Cyrillic}0-9\-._~]'; + private static $validateUrlPctEncoded = '(?:%[0-9a-f]{2})'; + private static $validateUrlSubDelims = '[!$&\'()*+,;=]'; + private static $validateUrlIpv4 = '(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})'; + private static $validateUrlIpv6 = '(?:\[[a-f0-9:\.]+\])'; + private static $validateUrlPort = '[0-9]{1,5}'; + /** * This constructor is used to populate some variables. * @@ -272,68 +280,6 @@ public static function __static() $re['valid_cashtag'] = '/(^|[' . $tmp['spaces'] . '])([' . $tmp['cash_signs'] . '])(' . $tmp['cashtag'] . ')(?=($|\s|[[:punct:]]))/iu'; $re['end_cashtag_match'] = '/\A(?:[' . $tmp['cash_signs'] . ']|:\/\/)/u'; - # These URL validation pattern strings are based on the ABNF from RFC 3986 - $tmp['validate_url_unreserved'] = '[a-z\p{Cyrillic}0-9\-._~]'; - $tmp['validate_url_pct_encoded'] = '(?:%[0-9a-f]{2})'; - $tmp['validate_url_sub_delims'] = '[!$&\'()*+,;=]'; - $tmp['validate_url_pchar'] = '(?:' . $tmp['validate_url_unreserved'] . '|' . $tmp['validate_url_pct_encoded'] . '|' . $tmp['validate_url_sub_delims'] . '|[:\|@])'; #/iox - - $tmp['validate_url_userinfo'] = '(?:' . $tmp['validate_url_unreserved'] . '|' . $tmp['validate_url_pct_encoded'] . '|' . $tmp['validate_url_sub_delims'] . '|:)*'; #/iox - - $tmp['validate_url_dec_octet'] = '(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])'; #/i - $tmp['validate_url_ipv4'] = '(?:' . $tmp['validate_url_dec_octet'] . '(?:\.' . $tmp['validate_url_dec_octet'] . '){3})'; #/iox - # Punting on real IPv6 validation for now - $tmp['validate_url_ipv6'] = '(?:\[[a-f0-9:\.]+\])'; #/i - # Also punting on IPvFuture for now - $tmp['validate_url_ip'] = '(?:' . $tmp['validate_url_ipv4'] . '|' . 
$tmp['validate_url_ipv6'] . ')'; #/iox - # This is more strict than the rfc specifies - $tmp['validate_url_subdomain_segment'] = '(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)'; #/i - $tmp['validate_url_domain_segment'] = '(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)'; #/i - $tmp['validate_url_domain_tld'] = '(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)'; #/i - $tmp['validate_url_domain'] = '(?:(?:' . $tmp['validate_url_subdomain_segment'] . '\.)*(?:' . $tmp['validate_url_domain_segment'] . '\.)' . $tmp['validate_url_domain_tld'] . ')'; #/iox - - $tmp['validate_url_host'] = '(?:' . $tmp['validate_url_ip'] . '|' . $tmp['validate_url_domain'] . ')'; #/iox - # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences - $tmp['validate_url_unicode_subdomain_segment'] = '(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)'; #/ix - $tmp['validate_url_unicode_domain_segment'] = '(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)'; #/ix - $tmp['validate_url_unicode_domain_tld'] = '(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)'; #/ix - $tmp['validate_url_unicode_domain'] = '(?:(?:' . $tmp['validate_url_unicode_subdomain_segment'] . '\.)*(?:' . $tmp['validate_url_unicode_domain_segment'] . '\.)' . $tmp['validate_url_unicode_domain_tld'] . ')'; #/iox - - $tmp['validate_url_unicode_host'] = '(?:' . $tmp['validate_url_ip'] . '|' . $tmp['validate_url_unicode_domain'] . ')'; #/iox - - $tmp['validate_url_port'] = '[0-9]{1,5}'; - - $re['validate_url_unicode_authority'] = '/' - . '(?:(' . $tmp['validate_url_userinfo'] . ')@)?' # $1 userinfo - . '(' . $tmp['validate_url_unicode_host'] . ')' # $2 host - . '(?::(' . $tmp['validate_url_port'] . '))?' # $3 port - . '/iux'; - - $re['validate_url_authority'] = '/' - . '(?:(' . $tmp['validate_url_userinfo'] . ')@)?' # $1 userinfo - . '(' . $tmp['validate_url_host'] . ')' # $2 host - . '(?::(' . $tmp['validate_url_port'] . '))?' 
# $3 port - . '/ix'; - - $re['validate_url_scheme'] = '/(?:[a-z][a-z0-9+\-.]*)/i'; - $re['validate_url_path'] = '/(\/' . $tmp['validate_url_pchar'] . '*)*/iu'; - $re['validate_url_query'] = '/(' . $tmp['validate_url_pchar'] . '|\/|\?)*/iu'; - $re['validate_url_fragment'] = '/(' . $tmp['validate_url_pchar'] . '|\/|\?)*/iu'; - - # Modified version of RFC 3986 Appendix B - $re['validate_url_unencoded'] = '/^' # Full URL - . '(?:' - . '([^:\/?#]+):\/\/' # $1 Scheme - . ')?' - . '([^\/?#]*)' # $2 Authority - . '([^?#]*)' # $3 Path - . '(?:' - . '\?([^#]*)' # $4 Query - . ')?' - . '(?:' - . '\#(.*)' # $5 Fragment - . ')?$/iux'; - # Flag that initialization is complete: $initialized = true; } @@ -371,6 +317,234 @@ public static function getRtlCharsMatcher() return $regexp; } + + /** + * Get url matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getValidateUrlUnencodedMatcher() + { + static $regexp = null; + + if ($regexp === null) { + # Modified version of RFC 3986 Appendix B + $regexp = '/\A' # Full URL + . '(?:' + . '([^:\/?#]+):\/\/' # $1 Scheme + . ')?' + . '([^\/?#]*)' # $2 Authority + . '([^?#]*)' # $3 Path + . '(?:' + . '\?([^#]*)' # $4 Query + . ')?' + . '(?:' + . '\#(.*)' # $5 Fragment + . ')?\z/iux'; + } + + return $regexp; + } + + /** + * Get valid url ip + * + * @return string matcher + */ + private static function getValidateUrlIp() + { + return '(?:' . static::$validateUrlIpv4 . '|' . static::$validateUrlIpv6 . ')'; #/iox + } + + /** + * Get valid url domain + * + * @return string matcher + */ + private static function getValidateUrlDomain() + { + $subdomain = '(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)'; #/i + $domain = '(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)'; #/i + $tld = '(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)'; #/i + + return '(?:(?:' . $subdomain . '\.)*(?:' . $domain . '\.)' . $tld . 
')'; #/iox + } + + /** + * Get valid url host + * + * @return string matcher + */ + private static function getValidateUrlHost() + { + return '(?:' . static::getValidateUrlIp() . '|' . static::getValidateUrlDomain() . ')'; #/iox + } + + /** + * Get valid url unicode domain + * + * @return string matcher + */ + private static function getValidateUrlUnicodeDomain() + { + $subdomain = '(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)'; #/ix + $domain = '(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)'; #/ix + $tld = '(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)'; #/ix + + return '(?:(?:' . $subdomain . '\.)*(?:' . $domain . '\.)' . $tld . ')'; #/iox + } + + /** + * Get valid url unicode host + * + * @return string matcher + */ + private static function getValidateUrlUnicodeHost() + { + return '(?:' . static::getValidateUrlIp() . '|' . static::getValidateUrlUnicodeDomain() . ')'; #/iox + } + + /** + * Get valid url userinfo + * + * @return string matcher + */ + private static function getValidateUrlUserinfo() + { + return '(?:' . static::$validateUrlUnreserved + . '|' . static::$validateUrlPctEncoded + . '|' . static::$validateUrlSubDelims + . '|:)*'; #/iox + } + + /** + * Get url unicode authority matcher + * + * Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences + * + * @staticvar string $regexp + * @return string + */ + public static function getValidateUrlUnicodeAuthorityMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/' + . '(?:(' . static::getValidateUrlUserinfo() . ')@)?' # $1 userinfo + . '(' . static::getValidateUrlUnicodeHost() . ')' # $2 host + . '(?::(' . static::$validateUrlPort . '))?' # $3 port + . 
'/iux'; + } + + return $regexp; + } + + /** + * Get url authority matcher + * + * This is more strict than the rfc specifies + * + * @staticvar string $regexp + * @return string + */ + public static function getValidateUrlAuthorityMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/' + . '(?:(' . static::getValidateUrlUserinfo() . ')@)?' # $1 userinfo + . '(' . static::getValidateUrlHost() . ')' # $2 host + . '(?::(' . static::$validateUrlPort . '))?' # $3 port + . '/ix'; + } + + return $regexp; + } + + /** + * Get url scheme matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getValidateUrlSchemeMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/(?:[a-z][a-z0-9+\-.]*)/i'; + } + + return $regexp; + } + + /** + * Get valid url characters + * + * @return string matcher + */ + private static function getValidateUrlPchar() + { + return '(?:' . static::$validateUrlUnreserved + . '|' . static::$validateUrlPctEncoded + . '|' . static::$validateUrlSubDelims + . '|[:\|@])'; #/iox + } + + /** + * Get url path matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getValidateUrlPathMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/(\/' . static::getValidateUrlPchar() . '*)*/iu'; + } + + return $regexp; + } + + /** + * Get url query matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getValidateUrlQueryMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/(' . static::getValidateUrlPchar() . '|\/|\?)*/iu'; + } + + return $regexp; + } + + /** + * Get url fragment matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getValidateUrlFragmentMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/(' . static::getValidateUrlPchar() . 
'|\/|\?)*/iu'; + } + + return $regexp; + } } + # Cause regular expressions to be initialized as soon as this file is loaded: Regex::__static(); diff --git a/lib/Twitter/Text/Validator.php b/lib/Twitter/Text/Validator.php index 429a887..80f35cd 100644 --- a/lib/Twitter/Text/Validator.php +++ b/lib/Twitter/Text/Validator.php @@ -300,26 +300,35 @@ public function isValidURL($url = null, $unicode_domains = true, $require_protoc if (is_null($url)) { $url = $this->tweet; } + $length = StringUtils::strlen($url); if (empty($url) || !$length) { return false; } - preg_match(self::$patterns['validate_url_unencoded'], $url, $matches); + + preg_match(Regex::getValidateUrlUnencodedMatcher(), $url, $matches); $match = array_shift($matches); if (!$matches || $match !== $url) { return false; } + list($scheme, $authority, $path, $query, $fragment) = array_pad($matches, 5, ''); + # Check scheme, path, query, fragment: if (($require_protocol && !( - self::isValidMatch($scheme, self::$patterns['validate_url_scheme']) && preg_match('/^https?$/i', $scheme)) - ) || !self::isValidMatch($path, self::$patterns['validate_url_path']) || !self::isValidMatch($query, self::$patterns['validate_url_query'], true) - || !self::isValidMatch($fragment, self::$patterns['validate_url_fragment'], true)) { + self::isValidMatch($scheme, Regex::getValidateUrlSchemeMatcher()) + && preg_match('/^https?$/i', $scheme) + )) + || !self::isValidMatch($path, Regex::getValidateUrlPathMatcher()) + || !self::isValidMatch($query, Regex::getValidateUrlQueryMatcher(), true) + || !self::isValidMatch($fragment, Regex::getValidateUrlFragmentMatcher(), true)) { return false; } + # Check authority: - $authority_pattern = $unicode_domains ? 'validate_url_unicode_authority' : 'validate_url_authority'; - return self::isValidMatch($authority, self::$patterns[$authority_pattern]); + $authorityPattern = $unicode_domains ? 
Regex::getValidateUrlUnicodeAuthorityMatcher() : Regex::getValidateUrlAuthorityMatcher(); + + return self::isValidMatch($authority, $authorityPattern); } /** diff --git a/tests/Twitter/Text/RegexTest.php b/tests/Twitter/Text/RegexTest.php index 201ea88..6c9c7fe 100644 --- a/tests/Twitter/Text/RegexTest.php +++ b/tests/Twitter/Text/RegexTest.php @@ -35,4 +35,94 @@ public function testGetRtlCharsMatcher() $matcherCached = Regex::getRtlCharsMatcher(); $this->assertSame($matcher, $matcherCached); } + + /** + * @covers Twitter\Text\Regex::getValidateUrlUnencodedMatcher + */ + public function testGetValidateUrlUnencodedMatcher() + { + $matcher = Regex::getValidateUrlUnencodedMatcher(); + $this->assertStringStartsWith('/\A(?:', $matcher); + $this->assertStringEndsWith(')?\z/iux', $matcher); + + $matcherCached = Regex::getValidateUrlUnencodedMatcher(); + $this->assertSame($matcher, $matcherCached); + } + + /** + * @covers Twitter\Text\Regex::getValidateUrlUnicodeAuthorityMatcher + */ + public function testGetValidateUrlUnicodeAuthorityMatcher() + { + $matcher = Regex::getValidateUrlUnicodeAuthorityMatcher(); + $this->assertStringStartsWith('/(?:', $matcher); + $this->assertStringEndsWith(')?/iux', $matcher); + + $matcherCached = Regex::getValidateUrlUnicodeAuthorityMatcher(); + $this->assertSame($matcher, $matcherCached); + } + + /** + * @covers Twitter\Text\Regex::getValidateUrlAuthorityMatcher + */ + public function testGetValidateUrlAuthorityMatcher() + { + $matcher = Regex::getValidateUrlAuthorityMatcher(); + $this->assertStringStartsWith('/(?:', $matcher); + $this->assertStringEndsWith(')?/ix', $matcher); + + $matcherCached = Regex::getValidateUrlAuthorityMatcher(); + $this->assertSame($matcher, $matcherCached); + } + + /** + * @covers Twitter\Text\Regex::getValidateUrlSchemeMatcher + */ + public function testGetValidateUrlSchemeMatcher() + { + $matcher = Regex::getValidateUrlSchemeMatcher(); + $this->assertSame('/(?:[a-z][a-z0-9+\-.]*)/i', $matcher); + + 
$matcherCached = Regex::getValidateUrlSchemeMatcher(); + $this->assertSame($matcher, $matcherCached); + } + + /** + * @covers Twitter\Text\Regex::getValidateUrlPathMatcher + */ + public function testGetValidateUrlPathMatcher() + { + $matcher = Regex::getValidateUrlPathMatcher(); + $this->assertStringStartsWith('/(', $matcher); + $this->assertStringEndsWith(')*/iu', $matcher); + + $matcherCached = Regex::getValidateUrlPathMatcher(); + $this->assertSame($matcher, $matcherCached); + } + + /** + * @covers Twitter\Text\Regex::getValidateUrlQueryMatcher + */ + public function testGetValidateUrlQueryMatcher() + { + $matcher = Regex::getValidateUrlQueryMatcher(); + $this->assertStringStartsWith('/(', $matcher); + $this->assertStringEndsWith(')*/iu', $matcher); + + $matcherCached = Regex::getValidateUrlQueryMatcher(); + $this->assertSame($matcher, $matcherCached); + } + + /** + * @covers Twitter\Text\Regex::getValidateUrlFragmentMatcher + */ + public function testGetValidateUrlFragmentMatcher() + { + $matcher = Regex::getValidateUrlFragmentMatcher(); + $this->assertStringStartsWith('/(', $matcher); + $this->assertStringEndsWith(')*/iu', $matcher); + + $matcherCached = Regex::getValidateUrlFragmentMatcher(); + $this->assertSame($matcher, $matcherCached); + } } From 1e3373d4903d40aec3614db872ad108d152056a1 Mon Sep 17 00:00:00 2001 From: nojimage Date: Tue, 19 Dec 2017 18:58:33 +0900 Subject: [PATCH 05/10] replace cashtag matchers to static method --- lib/Twitter/Text/Extractor.php | 2 +- lib/Twitter/Text/LooseAutolink.php | 4 +- lib/Twitter/Text/Regex.php | 100 ++++++++++++++++++++--------- tests/Twitter/Text/RegexTest.php | 26 ++++++++ 4 files changed, 98 insertions(+), 34 deletions(-) diff --git a/lib/Twitter/Text/Extractor.php b/lib/Twitter/Text/Extractor.php index 0b1c3af..8cbefda 100644 --- a/lib/Twitter/Text/Extractor.php +++ b/lib/Twitter/Text/Extractor.php @@ -296,7 +296,7 @@ public function extractCashtagsWithIndices($tweet = null) return array(); } - 
preg_match_all(self::$patterns['valid_cashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); + preg_match_all(Regex::getValidCashtagMatcher(), $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); $tags = array(); foreach ($matches as $match) { diff --git a/lib/Twitter/Text/LooseAutolink.php b/lib/Twitter/Text/LooseAutolink.php index 15cc2bd..418f436 100644 --- a/lib/Twitter/Text/LooseAutolink.php +++ b/lib/Twitter/Text/LooseAutolink.php @@ -148,7 +148,7 @@ public function addLinksToHashtags() public function addLinksToCashtags() { return preg_replace_callback( - self::$patterns['valid_cashtag'], + Regex::getValidCashtagMatcher(), array($this, '_addLinksToCashtags'), $this->tweet ); @@ -286,7 +286,7 @@ protected function _addLinksToHashtags($matches) protected function _addLinksToCashtags($matches) { list($all, $before, $cash, $tag, $after) = array_pad($matches, 5, ''); - if (preg_match(self::$patterns['end_cashtag_match'], $after) + if (preg_match(Regex::getEndCashtagMatcher(), $after) || (!preg_match('!\A["\']!', $before) && preg_match('!\A["\']!', $after)) || preg_match('!\A.. 
+ * 0x0020 Zs # SPACE + * 0x0085 Cc # + * 0x00A0 Zs # NO-BREAK SPACE + * 0x1680 Zs # OGHAM SPACE MARK + * 0x180E Zs # MONGOLIAN VOWEL SEPARATOR + * 0x2000-0x200A Zs # EN QUAD..HAIR SPACE + * 0x2028 Zl # LINE SEPARATOR + * 0x2029 Zp # PARAGRAPH SEPARATOR + * 0x202F Zs # NARROW NO-BREAK SPACE + * 0x205F Zs # MEDIUM MATHEMATICAL SPACE + * 0x3000 Zs # IDEOGRAPHIC SPACE + * + * @var string + */ + # + private static $spaces = '\x{0009}-\x{000D}\x{0020}\x{0085}\x{00a0}\x{1680}\x{180E}\x{2000}-\x{200a}\x{2028}\x{2029}\x{202f}\x{205f}\x{3000}'; + /** * Invalid Characters * @@ -68,6 +89,10 @@ class Regex */ private static $rtlChars = '\x{0600}-\x{06ff}\x{0750}-\x{077f}\x{08a0}-\x{08ff}\x{0590}-\x{05ff}\x{fb50}-\x{fdff}\x{fe70}-\x{feff}'; + # cash tags + private static $cashSigns = '\$'; + private static $cashtag = '[a-z]{1,6}(?:[._][a-z]{1,2})?'; + # These URL validation pattern strings are based on the ABNF from RFC 3986 private static $validateUrlUnreserved = '[a-z\p{Cyrillic}0-9\-._~]'; private static $validateUrlPctEncoded = '(?:%[0-9a-f]{2})'; @@ -101,22 +126,6 @@ public static function __static() # Initialise local storage arrays: $tmp = array(); - # Expression to match whitespace characters. - # - # 0x0009-0x000D Cc # .. - # 0x0020 Zs # SPACE - # 0x0085 Cc # - # 0x00A0 Zs # NO-BREAK SPACE - # 0x1680 Zs # OGHAM SPACE MARK - # 0x180E Zs # MONGOLIAN VOWEL SEPARATOR - # 0x2000-0x200A Zs # EN QUAD..HAIR SPACE - # 0x2028 Zl # LINE SEPARATOR - # 0x2029 Zp # PARAGRAPH SEPARATOR - # 0x202F Zs # NARROW NO-BREAK SPACE - # 0x205F Zs # MEDIUM MATHEMATICAL SPACE - # 0x3000 Zs # IDEOGRAPHIC SPACE - $tmp['spaces'] = '\x{0009}-\x{000D}\x{0020}\x{0085}\x{00a0}\x{1680}\x{180E}\x{2000}-\x{200a}\x{2028}\x{2029}\x{202f}\x{205f}\x{3000}'; - # Expression to match at and hash sign characters: $tmp['at_signs'] = '@@'; $tmp['hash_signs'] = '##'; @@ -182,7 +191,7 @@ public static function __static() # look-ahead capture here and don't append $after when we return. 
$tmp['valid_mention_preceding_chars'] = '([^a-zA-Z0-9_!#\$%&*@@\/]|^|(?:^|[^a-z0-9_+~.-])RT:?)'; $re['valid_mentions_or_lists'] = '/' . $tmp['valid_mention_preceding_chars'] . '([' . $tmp['at_signs'] . '])([a-z0-9_]{1,20})(\/[a-z][a-z0-9_\-]{0,24})?(?=(.*|$))/iu'; - $re['valid_reply'] = '/^(?:[' . $tmp['spaces'] . '])*[' . $tmp['at_signs'] . ']([a-z0-9_]{1,20})(?=(.*|$))/iu'; + $re['valid_reply'] = '/^(?:[' . static::$spaces . '])*[' . $tmp['at_signs'] . ']([a-z0-9_]{1,20})(?=(.*|$))/iu'; $re['end_mention_match'] = '/\A(?:[' . $tmp['at_signs'] . ']|[' . $tmp['latin_accents'] . ']|:\/\/)/iu'; # URL related hash regex collection @@ -192,7 +201,7 @@ public static function __static() $tmp['domain_valid_chars'] = '0-9a-z' . $tmp['latin_accents']; $tmp['valid_subdomain'] = '(?>(?:[' . $tmp['domain_valid_chars'] . '][' . $tmp['domain_valid_chars'] . '\-_]*)?[' . $tmp['domain_valid_chars'] . ']\.)'; $tmp['valid_domain_name'] = '(?:(?:[' . $tmp['domain_valid_chars'] . '][' . $tmp['domain_valid_chars'] . '\-]*)?[' . $tmp['domain_valid_chars'] . ']\.)'; - $tmp['domain_valid_unicode_chars'] = '[^\p{P}\p{Z}\p{C}' . static::$invalidCharacters . $tmp['spaces'] . ']'; + $tmp['domain_valid_unicode_chars'] = '[^\p{P}\p{Z}\p{C}' . static::$invalidCharacters . static::$spaces . ']'; $tmp['valid_gTLD'] = TldLists::getValidGTLD(); $tmp['valid_ccTLD'] = TldLists::getValidCcTLD(); @@ -275,11 +284,6 @@ public static function __static() . ')' . ')/iux'; - $tmp['cash_signs'] = '\$'; - $tmp['cashtag'] = '[a-z]{1,6}(?:[._][a-z]{1,2})?'; - $re['valid_cashtag'] = '/(^|[' . $tmp['spaces'] . '])([' . $tmp['cash_signs'] . '])(' . $tmp['cashtag'] . ')(?=($|\s|[[:punct:]]))/iu'; - $re['end_cashtag_match'] = '/\A(?:[' . $tmp['cash_signs'] . 
']|:\/\/)/u'; - # Flag that initialization is complete: $initialized = true; } @@ -318,6 +322,40 @@ public static function getRtlCharsMatcher() return $regexp; } + /** + * Get valid cashtag matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getValidCashtagMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/(^|[' . static::$spaces . '])([' . static::$cashSigns . '])(' . static::$cashtag . ')(?=($|\s|[[:punct:]]))/iu'; + } + + return $regexp; + } + + /** + * Get end of cashtag matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getEndCashtagMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/\A(?:[' . static::$cashSigns . ']|:\/\/)/u'; + } + + return $regexp; + } + /** * Get url matcher * @@ -432,10 +470,10 @@ public static function getValidateUrlUnicodeAuthorityMatcher() if ($regexp === null) { $regexp = '/' - . '(?:(' . static::getValidateUrlUserinfo() . ')@)?' # $1 userinfo - . '(' . static::getValidateUrlUnicodeHost() . ')' # $2 host - . '(?::(' . static::$validateUrlPort . '))?' # $3 port - . '/iux'; + . '(?:(' . static::getValidateUrlUserinfo() . ')@)?' # $1 userinfo + . '(' . static::getValidateUrlUnicodeHost() . ')' # $2 host + . '(?::(' . static::$validateUrlPort . '))?' # $3 port + . '/iux'; } return $regexp; @@ -455,10 +493,10 @@ public static function getValidateUrlAuthorityMatcher() if ($regexp === null) { $regexp = '/' - . '(?:(' . static::getValidateUrlUserinfo() . ')@)?' # $1 userinfo - . '(' . static::getValidateUrlHost() . ')' # $2 host - . '(?::(' . static::$validateUrlPort . '))?' # $3 port - . '/ix'; + . '(?:(' . static::getValidateUrlUserinfo() . ')@)?' # $1 userinfo + . '(' . static::getValidateUrlHost() . ')' # $2 host + . '(?::(' . static::$validateUrlPort . '))?' # $3 port + . 
'/ix'; } return $regexp; diff --git a/tests/Twitter/Text/RegexTest.php b/tests/Twitter/Text/RegexTest.php index 6c9c7fe..1ab2ff5 100644 --- a/tests/Twitter/Text/RegexTest.php +++ b/tests/Twitter/Text/RegexTest.php @@ -36,6 +36,32 @@ public function testGetRtlCharsMatcher() $this->assertSame($matcher, $matcherCached); } + /** + * @covers Twitter\Text\Regex::getValidCashtagMatcher + */ + public function testGetValidCashtagMatcher() + { + $matcher = Regex::getValidCashtagMatcher(); + $this->assertStringStartsWith('/(^|[', $matcher); + $this->assertStringEndsWith(']))/iu', $matcher); + + $matcherCached = Regex::getValidCashtagMatcher(); + $this->assertSame($matcher, $matcherCached); + } + + /** + * @covers Twitter\Text\Regex::getEndCashtagMatcher + */ + public function testGetEndCashtagMatcher() + { + $matcher = Regex::getEndCashtagMatcher(); + $this->assertStringStartsWith('/\A(?:', $matcher); + $this->assertStringEndsWith(')/u', $matcher); + + $matcherCached = Regex::getEndCashtagMatcher(); + $this->assertSame($matcher, $matcherCached); + } + /** * @covers Twitter\Text\Regex::getValidateUrlUnencodedMatcher */ From 683c6f77ea31b8953efd76ded95d5aacc69da9ba Mon Sep 17 00:00:00 2001 From: nojimage Date: Tue, 19 Dec 2017 19:27:31 +0900 Subject: [PATCH 06/10] replace hashtag matchers to static method --- lib/Twitter/Text/Extractor.php | 6 +- lib/Twitter/Text/LooseAutolink.php | 4 +- lib/Twitter/Text/Regex.php | 120 ++++++++++++++++++++--------- tests/Twitter/Text/RegexTest.php | 26 +++++++ 4 files changed, 114 insertions(+), 42 deletions(-) diff --git a/lib/Twitter/Text/Extractor.php b/lib/Twitter/Text/Extractor.php index 8cbefda..8d7c464 100644 --- a/lib/Twitter/Text/Extractor.php +++ b/lib/Twitter/Text/Extractor.php @@ -243,7 +243,7 @@ public function extractHashtagsWithIndices($tweet = null, $checkUrlOverlap = tru return array(); } - preg_match_all(self::$patterns['valid_hashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); + 
preg_match_all(Regex::getValidHashtagMatcher(), $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); $tags = array(); foreach ($matches as $match) { @@ -251,7 +251,7 @@ public function extractHashtagsWithIndices($tweet = null, $checkUrlOverlap = tru $start_position = $hash[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $hash[1])) : $hash[1]; $end_position = $start_position + StringUtils::strlen($hash[0] . $hashtag[0]); - if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) { + if (preg_match(Regex::getEndHashtagMatcher(), $outer[0])) { continue; } @@ -304,7 +304,7 @@ public function extractCashtagsWithIndices($tweet = null) $start_position = $dollar[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $dollar[1])) : $dollar[1]; $end_position = $start_position + StringUtils::strlen($dollar[0] . $cash_text[0]); - if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) { + if (preg_match(Regex::getEndHashtagMatcher(), $outer[0])) { continue; } diff --git a/lib/Twitter/Text/LooseAutolink.php b/lib/Twitter/Text/LooseAutolink.php index 418f436..aa564dc 100644 --- a/lib/Twitter/Text/LooseAutolink.php +++ b/lib/Twitter/Text/LooseAutolink.php @@ -134,7 +134,7 @@ public function addLinks() public function addLinksToHashtags() { return preg_replace_callback( - self::$patterns['valid_hashtag'], + Regex::getValidHashtagMatcher(), array($this, '_addLinksToHashtags'), $this->tweet ); @@ -261,7 +261,7 @@ protected function wrapHash($url, $class, $element) protected function _addLinksToHashtags($matches) { list($all, $before, $hash, $tag, $after) = array_pad($matches, 5, ''); - if (preg_match(self::$patterns['end_hashtag_match'], $after) + if (preg_match(Regex::getEndHashtagMatcher(), $after) || (!preg_match('!\A["\']!', $before) && preg_match('!\A["\']!', $after)) || preg_match('!\AassertSame($matcher, $matcherCached); } + /** + * @covers Twitter\Text\Regex::getValidHashtagMatcher + */ + public function testGetValidHashtagMatcher() + { + $matcher = 
Regex::getValidHashtagMatcher(); + $this->assertStringStartsWith('/((?:', $matcher); + $this->assertStringEndsWith('))/iu', $matcher); + + $matcherCached = Regex::getValidHashtagMatcher(); + $this->assertSame($matcher, $matcherCached); + } + + /** + * @covers Twitter\Text\Regex::getEndHashtagMatcher + */ + public function testGetEndHashtagMatcher() + { + $matcher = Regex::getEndHashtagMatcher(); + $this->assertStringStartsWith('/\A(?:', $matcher); + $this->assertStringEndsWith(')/u', $matcher); + + $matcherCached = Regex::getEndHashtagMatcher(); + $this->assertSame($matcher, $matcherCached); + } + /** * @covers Twitter\Text\Regex::getValidCashtagMatcher */ From 14bb731f78da153fd92f25e1c82456673e23920d Mon Sep 17 00:00:00 2001 From: nojimage Date: Tue, 19 Dec 2017 19:55:50 +0900 Subject: [PATCH 07/10] replace mention matchers to static method --- lib/Twitter/Text/Extractor.php | 8 +- lib/Twitter/Text/LooseAutolink.php | 4 +- lib/Twitter/Text/Regex.php | 126 ++++++++++++++++++++--------- lib/Twitter/Text/Validator.php | 2 +- tests/Twitter/Text/RegexTest.php | 39 +++++++++ 5 files changed, 135 insertions(+), 44 deletions(-) diff --git a/lib/Twitter/Text/Extractor.php b/lib/Twitter/Text/Extractor.php index 8d7c464..c5c615b 100644 --- a/lib/Twitter/Text/Extractor.php +++ b/lib/Twitter/Text/Extractor.php @@ -205,9 +205,9 @@ public function extractReplyScreenname($tweet = null) if (is_null($tweet)) { $tweet = $this->tweet; } - $matched = preg_match(self::$patterns['valid_reply'], $tweet, $matches); + $matched = preg_match(Regex::getValidReplyMatcher(), $tweet, $matches); # Check username ending in - if ($matched && preg_match(self::$patterns['end_mention_match'], $matches[2])) { + if ($matched && preg_match(Regex::getEndMentionMatcher(), $matches[2])) { $matched = false; } return $matched ? 
$matches[1] : null; @@ -453,7 +453,7 @@ public function extractMentionsOrListsWithIndices($tweet = null) return array(); } - preg_match_all(self::$patterns['valid_mentions_or_lists'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); + preg_match_all(Regex::getValidMentionsOrListsMatcher(), $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); $results = array(); foreach ($matches as $match) { @@ -466,7 +466,7 @@ public function extractMentionsOrListsWithIndices($tweet = null) 'indices' => array($start_position, $end_position), ); - if (preg_match(self::$patterns['end_mention_match'], $outer[0])) { + if (preg_match(Regex::getEndMentionMatcher(), $outer[0])) { continue; } diff --git a/lib/Twitter/Text/LooseAutolink.php b/lib/Twitter/Text/LooseAutolink.php index aa564dc..d03da9c 100644 --- a/lib/Twitter/Text/LooseAutolink.php +++ b/lib/Twitter/Text/LooseAutolink.php @@ -172,7 +172,7 @@ public function addLinksToURLs() public function addLinksToUsernamesAndLists() { return preg_replace_callback( - self::$patterns['valid_mentions_or_lists'], + Regex::getValidMentionsOrListsMatcher(), array($this, '_addLinksToUsernamesAndLists'), $this->tweet ); @@ -331,7 +331,7 @@ protected function _addLinksToUsernamesAndLists($matches) $class = $this->class_list; $url = $this->url_base_list . $element; } else { - if (preg_match(self::$patterns['end_mention_match'], $after)) { + if (preg_match(Regex::getEndMentionMatcher(), $after)) { return $all; } # Replace the username diff --git a/lib/Twitter/Text/Regex.php b/lib/Twitter/Text/Regex.php index 04e847a..325a484 100644 --- a/lib/Twitter/Text/Regex.php +++ b/lib/Twitter/Text/Regex.php @@ -66,6 +66,34 @@ class Regex # private static $spaces = '\x{0009}-\x{000D}\x{0020}\x{0085}\x{00a0}\x{1680}\x{180E}\x{2000}-\x{200a}\x{2028}\x{2029}\x{202f}\x{205f}\x{3000}'; + /** + * Expression to match latin accented characters. 
+ * + * 0x00C0-0x00D6 + * 0x00D8-0x00F6 + * 0x00F8-0x00FF + * 0x0100-0x024f + * 0x0253-0x0254 + * 0x0256-0x0257 + * 0x0259 + * 0x025b + * 0x0263 + * 0x0268 + * 0x026f + * 0x0272 + * 0x0289 + * 0x028b + * 0x02bb + * 0x0300-0x036f + * 0x1e00-0x1eff + * + * Excludes 0x00D7 - multiplication sign (confusable with 'x'). + * Excludes 0x00F7 - division sign. + * + * @var string + */ + private static $latinAccents = '\x{00c0}-\x{00d6}\x{00d8}-\x{00f6}\x{00f8}-\x{00ff}\x{0100}-\x{024f}\x{0253}-\x{0254}\x{0256}-\x{0257}\x{0259}\x{025b}\x{0263}\x{0268}\x{026f}\x{0272}\x{0289}\x{028b}\x{02bb}\x{0300}-\x{036f}\x{1e00}-\x{1eff}'; + /** * Invalid Characters * @@ -130,45 +158,11 @@ public static function __static() # Initialise local storage arrays: $tmp = array(); - # Expression to match latin accented characters. - # - # 0x00C0-0x00D6 - # 0x00D8-0x00F6 - # 0x00F8-0x00FF - # 0x0100-0x024f - # 0x0253-0x0254 - # 0x0256-0x0257 - # 0x0259 - # 0x025b - # 0x0263 - # 0x0268 - # 0x026f - # 0x0272 - # 0x0289 - # 0x028b - # 0x02bb - # 0x0300-0x036f - # 0x1e00-0x1eff - # - # Excludes 0x00D7 - multiplication sign (confusable with 'x'). - # Excludes 0x00F7 - division sign. - $tmp['latin_accents'] = '\x{00c0}-\x{00d6}\x{00d8}-\x{00f6}\x{00f8}-\x{00ff}'; - $tmp['latin_accents'] .= '\x{0100}-\x{024f}\x{0253}-\x{0254}\x{0256}-\x{0257}'; - $tmp['latin_accents'] .= '\x{0259}\x{025b}\x{0263}\x{0268}\x{026f}\x{0272}\x{0289}\x{028b}\x{02bb}\x{0300}-\x{036f}\x{1e00}-\x{1eff}'; - - # XXX: PHP doesn't have Ruby's $' (dollar apostrophe) so we have to capture - # $after in the following regular expression. Note that we only use a - # look-ahead capture here and don't append $after when we return. - $tmp['valid_mention_preceding_chars'] = '([^a-zA-Z0-9_!#\$%&*@@\/]|^|(?:^|[^a-z0-9_+~.-])RT:?)'; - $re['valid_mentions_or_lists'] = '/' . $tmp['valid_mention_preceding_chars'] . '([' . static::$atSigns . '])([a-z0-9_]{1,20})(\/[a-z][a-z0-9_\-]{0,24})?(?=(.*|$))/iu'; - $re['valid_reply'] = '/^(?:[' . 
static::$spaces . '])*[' . static::$atSigns . ']([a-z0-9_]{1,20})(?=(.*|$))/iu'; - $re['end_mention_match'] = '/\A(?:[' . static::$atSigns . ']|[' . $tmp['latin_accents'] . ']|:\/\/)/iu'; - # URL related hash regex collection $tmp['valid_url_preceding_chars'] = '(?:[^A-Z0-9_@@\$##' . static::$invalidCharacters . ']|^)'; - $tmp['domain_valid_chars'] = '0-9a-z' . $tmp['latin_accents']; + $tmp['domain_valid_chars'] = '0-9a-z' . static::$latinAccents; $tmp['valid_subdomain'] = '(?>(?:[' . $tmp['domain_valid_chars'] . '][' . $tmp['domain_valid_chars'] . '\-_]*)?[' . $tmp['domain_valid_chars'] . ']\.)'; $tmp['valid_domain_name'] = '(?:(?:[' . $tmp['domain_valid_chars'] . '][' . $tmp['domain_valid_chars'] . '\-]*)?[' . $tmp['domain_valid_chars'] . ']\.)'; $tmp['domain_valid_unicode_chars'] = '[^\p{P}\p{Z}\p{C}' . static::$invalidCharacters . static::$spaces . ']'; @@ -214,7 +208,7 @@ public static function __static() $tmp['valid_port_number'] = '[0-9]+'; - $tmp['valid_general_url_path_chars'] = '[a-z\p{Cyrillic}0-9!\*;:=\+\,\.\$\/%#\[\]\-_~&|@' . $tmp['latin_accents'] . ']'; + $tmp['valid_general_url_path_chars'] = '[a-z\p{Cyrillic}0-9!\*;:=\+\,\.\$\/%#\[\]\-_~&|@' . static::$latinAccents . ']'; # Allow URL paths to contain up to two nested levels of balanced parentheses: # 1. Used in Wikipedia URLs, e.g. /Primer_(film) # 2. Used in IIS sessions, e.g. /S(dfd346)/ @@ -232,7 +226,7 @@ public static function __static() . '\))'; # Valid end-of-path characters (so /foo. does not gobble the period). # 1. Allow =&# for empty URL parameters and other URL-join artifacts. - $tmp['valid_url_path_ending_chars'] = '[a-z\p{Cyrillic}0-9=_#\/\+\-' . $tmp['latin_accents'] . ']|(?:' . $tmp['valid_url_balanced_parens'] . ')'; + $tmp['valid_url_path_ending_chars'] = '[a-z\p{Cyrillic}0-9=_#\/\+\-' . static::$latinAccents . ']|(?:' . $tmp['valid_url_balanced_parens'] . ')'; $tmp['valid_url_path'] = '(?:(?:' . $tmp['valid_general_url_path_chars'] . '*(?:' . $tmp['valid_url_balanced_parens'] . 
' ' @@ -294,6 +288,64 @@ public static function getRtlCharsMatcher() // ================================================================================================================= + # NOTE: PHP doesn't have Ruby's $' (dollar apostrophe) so we have to capture + # $after in the following regular expression. Note that we only use a + # look-ahead capture here and don't append $after when we return. + + /** + * Get valid mentions or lists matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getValidMentionsOrListsMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $mention_preceding_chars = '([^a-zA-Z0-9_!#\$%&*@@\/]|^|(?:^|[^a-z0-9_+~.-])RT:?)'; + $regexp = '/' . $mention_preceding_chars . '([' . static::$atSigns . '])([a-z0-9_]{1,20})(\/[a-z][a-z0-9_\-]{0,24})?(?=(.*|$))/iu'; + } + + return $regexp; + } + + /** + * Get valid reply matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getValidReplyMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/^(?:[' . static::$spaces . '])*[' . static::$atSigns . ']([a-z0-9_]{1,20})(?=(.*|$))/iu'; + } + + return $regexp; + } + + /** + * Get end of mention matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getEndMentionMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/\A(?:[' . static::$atSigns . ']|[' . static::$latinAccents . 
']|:\/\/)/iu'; + } + + return $regexp; + } + + // ================================================================================================================= + /** * Get hashtag matcher * diff --git a/lib/Twitter/Text/Validator.php b/lib/Twitter/Text/Validator.php index 80f35cd..dbe79ee 100644 --- a/lib/Twitter/Text/Validator.php +++ b/lib/Twitter/Text/Validator.php @@ -240,7 +240,7 @@ public function isValidList($list = null) if (empty($list) || !$length) { return false; } - preg_match(self::$patterns['valid_mentions_or_lists'], $list, $matches); + preg_match(Regex::getValidMentionsOrListsMatcher(), $list, $matches); $matches = array_pad($matches, 5, ''); return isset($matches) && $matches[1] === '' && $matches[4] && !empty($matches[4]) && $matches[5] === ''; } diff --git a/tests/Twitter/Text/RegexTest.php b/tests/Twitter/Text/RegexTest.php index 19e6bb9..31999f2 100644 --- a/tests/Twitter/Text/RegexTest.php +++ b/tests/Twitter/Text/RegexTest.php @@ -36,6 +36,45 @@ public function testGetRtlCharsMatcher() $this->assertSame($matcher, $matcherCached); } + /** + * @covers Twitter\Text\Regex::getValidMentionsOrListsMatcher + */ + public function testGetValidMentionsOrListsMatcher() + { + $matcher = Regex::getValidMentionsOrListsMatcher(); + $this->assertStringStartsWith('/([', $matcher); + $this->assertStringEndsWith('(?=(.*|$))/iu', $matcher); + + $matcherCached = Regex::getValidMentionsOrListsMatcher(); + $this->assertSame($matcher, $matcherCached); + } + + /** + * @covers Twitter\Text\Regex::getValidReplyMatcher + */ + public function testGetValidReplyMatcher() + { + $matcher = Regex::getValidReplyMatcher(); + $this->assertStringStartsWith('/^(?:[', $matcher); + $this->assertStringEndsWith('(?=(.*|$))/iu', $matcher); + + $matcherCached = Regex::getValidReplyMatcher(); + $this->assertSame($matcher, $matcherCached); + } + + /** + * @covers Twitter\Text\Regex::getEndMentionMatcher + */ + public function testGetEndMentionMatcher() + { + $matcher = 
Regex::getEndMentionMatcher(); + $this->assertStringStartsWith('/\A(?:', $matcher); + $this->assertStringEndsWith(')/iu', $matcher); + + $matcherCached = Regex::getEndMentionMatcher(); + $this->assertSame($matcher, $matcherCached); + } + /** * @covers Twitter\Text\Regex::getValidHashtagMatcher */ From bb88bea806c3f34be47986940e33d5939a5a13a7 Mon Sep 17 00:00:00 2001 From: nojimage Date: Thu, 28 Dec 2017 18:52:24 +0900 Subject: [PATCH 08/10] replace url matchers to static method --- lib/Twitter/Text/Extractor.php | 12 +- lib/Twitter/Text/LooseAutolink.php | 2 +- lib/Twitter/Text/Regex.php | 336 ++++++++++++++++++++--------- 3 files changed, 241 insertions(+), 109 deletions(-) diff --git a/lib/Twitter/Text/Extractor.php b/lib/Twitter/Text/Extractor.php index c5c615b..963d7d2 100644 --- a/lib/Twitter/Text/Extractor.php +++ b/lib/Twitter/Text/Extractor.php @@ -335,7 +335,7 @@ public function extractURLsWithIndices($tweet = null) } $urls = array(); - preg_match_all(self::$patterns['valid_url'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); + preg_match_all(Regex::getValidUrlMatcher(), $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); foreach ($matches as $match) { list($all, $before, $url, $protocol, $domain, $port, $path, $query) = array_pad($match, 8, array('')); @@ -354,14 +354,14 @@ public function extractURLsWithIndices($tweet = null) // If protocol is missing and domain contains non-ASCII characters, // extract ASCII-only domains. 
if (empty($protocol)) { - if (!$this->extractURLWithoutProtocol || preg_match(self::$patterns['invalid_url_without_protocol_preceding_chars'], $before)) { + if (!$this->extractURLWithoutProtocol || preg_match(Regex::getInvalidUrlWithoutProtocolPrecedingCharsMatcher(), $before)) { continue; } $last_url = null; $ascii_end_position = 0; - if (preg_match(self::$patterns['valid_ascii_domain'], $domain, $asciiDomain)) { + if (preg_match(Regex::getValidAsciiDomainMatcher(), $domain, $asciiDomain)) { $asciiDomain[0] = preg_replace('/' . preg_quote($domain, '/') . '/u', $asciiDomain[0], $url); $ascii_start_position = StringUtils::strpos($domain, $asciiDomain[0], $ascii_end_position); $ascii_end_position = $ascii_start_position + StringUtils::strlen($asciiDomain[0]); @@ -370,8 +370,8 @@ public function extractURLsWithIndices($tweet = null) 'indices' => array($start_position + $ascii_start_position, $start_position + $ascii_end_position), ); if (!empty($path) - || preg_match(self::$patterns['valid_special_short_domain'], $asciiDomain[0]) - || !preg_match(self::$patterns['invalid_short_domain'], $asciiDomain[0])) { + || preg_match(Regex::getValidSpecialShortDomainMatcher(), $asciiDomain[0]) + || !preg_match(Regex::getInvalidCharactersMatcher(), $asciiDomain[0])) { $urls[] = $last_url; } } @@ -389,7 +389,7 @@ public function extractURLsWithIndices($tweet = null) } } else { // In the case of t.co URLs, don't allow additional path characters - if (preg_match(self::$patterns['valid_tco_url'], $url, $tcoUrlMatches)) { + if (preg_match(Regex::getValidTcoUrlMatcher(), $url, $tcoUrlMatches)) { $url = $tcoUrlMatches[0]; $end_position = $start_position + StringUtils::strlen($url); } diff --git a/lib/Twitter/Text/LooseAutolink.php b/lib/Twitter/Text/LooseAutolink.php index d03da9c..ca9000c 100644 --- a/lib/Twitter/Text/LooseAutolink.php +++ b/lib/Twitter/Text/LooseAutolink.php @@ -161,7 +161,7 @@ public function addLinksToCashtags() */ public function addLinksToURLs() { - return 
preg_replace_callback(self::$patterns['valid_url'], array($this, '_addLinksToURLs'), $this->tweet); + return preg_replace_callback(Regex::getValidUrlMatcher(), array($this, '_addLinksToURLs'), $this->tweet); } /** diff --git a/lib/Twitter/Text/Regex.php b/lib/Twitter/Text/Regex.php index 325a484..74121f6 100644 --- a/lib/Twitter/Text/Regex.php +++ b/lib/Twitter/Text/Regex.php @@ -133,6 +133,10 @@ class Regex private static $validateUrlIpv6 = '(?:\[[a-f0-9:\.]+\])'; private static $validateUrlPort = '[0-9]{1,5}'; + # URL related hash regex collection + private static $validSpecialCcTLD = '(?:(?:co|tv)(?=[^0-9a-z@]|$))'; + private static $validPunycode = '(?:xn--[0-9a-z]+)'; + /** * This constructor is used to populate some variables. * @@ -144,148 +148,279 @@ protected function __construct($tweet = null) } /** - * Emulate a static initialiser while PHP doesn't have one. + * Get invalid characters matcher + * + * @staticvar string $regexp + * @return string */ - public static function __static() + public static function getInvalidCharactersMatcher() { - # Check whether we have initialized the regular expressions: - static $initialized = false; - if ($initialized) { - return; + static $regexp = null; + + if ($regexp === null) { + $regexp = '/[' . static::$invalidCharacters . ']/u'; } - # Get a shorter reference to the regular expression array: - $re = & self::$patterns; - # Initialise local storage arrays: - $tmp = array(); - # URL related hash regex collection + return $regexp; + } - $tmp['valid_url_preceding_chars'] = '(?:[^A-Z0-9_@@\$##' . static::$invalidCharacters . ']|^)'; + /** + * Get RTL characters matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getRtlCharsMatcher() + { + static $regexp = null; - $tmp['domain_valid_chars'] = '0-9a-z' . static::$latinAccents; - $tmp['valid_subdomain'] = '(?>(?:[' . $tmp['domain_valid_chars'] . '][' . $tmp['domain_valid_chars'] . '\-_]*)?[' . $tmp['domain_valid_chars'] . 
']\.)'; - $tmp['valid_domain_name'] = '(?:(?:[' . $tmp['domain_valid_chars'] . '][' . $tmp['domain_valid_chars'] . '\-]*)?[' . $tmp['domain_valid_chars'] . ']\.)'; - $tmp['domain_valid_unicode_chars'] = '[^\p{P}\p{Z}\p{C}' . static::$invalidCharacters . static::$spaces . ']'; + if ($regexp === null) { + $regexp = '/[' . static::$rtlChars . ']/iu'; + } - $tmp['valid_gTLD'] = TldLists::getValidGTLD(); - $tmp['valid_ccTLD'] = TldLists::getValidCcTLD(); - $tmp['valid_special_ccTLD'] = '(?:(?:' . 'co|tv' . ')(?=[^0-9a-z@]|$))'; - $tmp['valid_punycode'] = '(?:xn--[0-9a-z]+)'; + return $regexp; + } - $tmp['valid_domain'] = '' - // subdomains + domain + TLD - // e.g. www.twitter.com, foo.co.jp, bar.co.uk - . '(?:' . $tmp['valid_subdomain'] . '+' . $tmp['valid_domain_name'] - . '(?:' . $tmp['valid_gTLD'] . '|' . $tmp['valid_ccTLD'] . '|' . $tmp['valid_punycode'] . '))' - // domain + gTLD | protocol + unicode domain + gTLD - . '|(?:' - . '(?:' - . $tmp['valid_domain_name'] . '|(?:(?<=http:\/\/|https:\/\/)' . $tmp['domain_valid_unicode_chars'] . '+\.)' - . ')' - . $tmp['valid_gTLD'] - . ')' - // domain + gTLD | some ccTLD - // e.g. twitter.com - . '|(?:' . $tmp['valid_domain_name'] . $tmp['valid_punycode'] . ')' - . '|(?:' . $tmp['valid_domain_name'] . $tmp['valid_special_ccTLD'] . ')' - // protocol + domain + ccTLD | protocol + unicode domain + ccTLD - . '|(?:(?<=http:\/\/|https:\/\/)' - . '(?:' . $tmp['valid_domain_name'] . '|' . $tmp['domain_valid_unicode_chars'] . '+\.)' - . $tmp['valid_ccTLD'] . ')' - // domain + ccTLD + '/' - // e.g. t.co/ - . '|(?:' . $tmp['valid_domain_name'] . $tmp['valid_ccTLD'] . '(?=\/))'; - # Used by the extractor: - $re['valid_ascii_domain'] = '/' . $tmp['valid_subdomain'] . '*' . $tmp['valid_domain_name'] . '(?:' . $tmp['valid_gTLD'] . '|' . $tmp['valid_ccTLD'] . '|' . $tmp['valid_punycode'] . 
')/iu'; + // ================================================================================================================= - # Used by the extractor for stricter t.co URL extraction: - $re['valid_tco_url'] = '/^https?:\/\/t\.co\/[a-z0-9]+/iu'; + /** + * Get valid ascii domain matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getValidAsciiDomainMatcher() + { + static $regexp = null; - # Used by the extractor to filter out unwanted URLs: - $re['invalid_short_domain'] = '/\A' . $tmp['valid_domain_name'] . $tmp['valid_ccTLD'] . '\Z/iu'; - $re['valid_special_short_domain'] = '/\A' . $tmp['valid_domain_name'] . $tmp['valid_special_ccTLD'] . '\Z/iu'; - $re['invalid_url_without_protocol_preceding_chars'] = '/[\-_.\/]\z/iu'; + if ($regexp === null) { + $regexp = '/' . static::getValidSubdomain() . '*' . static::getValidDomainName() + . '(?:' . TldLists::getValidGTLD() . '|' . TldLists::getValidCcTLD() + . '|' . static::$validPunycode . ')/iu'; + } - $tmp['valid_port_number'] = '[0-9]+'; + return $regexp; + } - $tmp['valid_general_url_path_chars'] = '[a-z\p{Cyrillic}0-9!\*;:=\+\,\.\$\/%#\[\]\-_~&|@' . static::$latinAccents . ']'; - # Allow URL paths to contain up to two nested levels of balanced parentheses: - # 1. Used in Wikipedia URLs, e.g. /Primer_(film) - # 2. Used in IIS sessions, e.g. /S(dfd346)/ - # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/ - $tmp['valid_url_balanced_parens'] = '(?:\(' - . '(?:' . $tmp['valid_general_url_path_chars'] . '+' - . '|' - // allow one nested level of balanced parentheses - . '(?:' - . $tmp['valid_general_url_path_chars'] . '*' - . '\(' . $tmp['valid_general_url_path_chars'] . '+' . '\)' - . $tmp['valid_general_url_path_chars'] . '*' - . ')' - . ')' - . '\))'; - # Valid end-of-path characters (so /foo. does not gobble the period). - # 1. Allow =&# for empty URL parameters and other URL-join artifacts. - $tmp['valid_url_path_ending_chars'] = '[a-z\p{Cyrillic}0-9=_#\/\+\-' . 
static::$latinAccents . ']|(?:' . $tmp['valid_url_balanced_parens'] . ')'; - $tmp['valid_url_path'] = '(?:(?:' - . $tmp['valid_general_url_path_chars'] . '*(?:' - . $tmp['valid_url_balanced_parens'] . ' ' - . $tmp['valid_general_url_path_chars'] . '*)*' - . $tmp['valid_url_path_ending_chars'] . ')|(?:@' - . $tmp['valid_general_url_path_chars'] . '+\/))'; - - $tmp['valid_url_query_chars'] = '[a-z0-9!?\*\'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]'; - $tmp['valid_url_query_ending_chars'] = '[a-z0-9_&=#\/\-]'; - - $re['valid_url'] = '/(?:' # $1 Complete match (preg_match() already matches everything.) - . '(' . $tmp['valid_url_preceding_chars'] . ')' # $2 Preceding characters - . '(' # $3 Complete URL - . '(https?:\/\/)?' # $4 Protocol (optional) - . '(' . $tmp['valid_domain'] . ')' # $5 Domain(s) - . '(?::(' . $tmp['valid_port_number'] . '))?' # $6 Port number (optional) - . '(\/' . $tmp['valid_url_path'] . '*)?' # $7 URL Path - . '(\?' . $tmp['valid_url_query_chars'] . '*' . $tmp['valid_url_query_ending_chars'] . ')?' # $8 Query String - . ')' - . ')/iux'; + /** + * Get valid tco url matcher + * + * Used by the extractor for stricter t.co URL extraction + * + * @staticvar string $regexp + * @return string + */ + public static function getValidTcoUrlMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/^https?:\/\/t\.co\/[a-z0-9]+/iu'; + } - # Flag that initialization is complete: - $initialized = true; + return $regexp; } /** - * Get invalid characters matcher + * Get invalid short domain matcher * * @staticvar string $regexp * @return string */ - public static function getInvalidCharactersMatcher() + public static function getInvalidShortDomainMatcher() { static $regexp = null; if ($regexp === null) { - $regexp = '/[' . static::$invalidCharacters . ']/u'; + $regexp = '/\A' . static::getValidDomainName() . TldLists::getValidCcTLD() . 
'\Z/iu'; } return $regexp; } /** - * Get RTL characters matcher + * Get valid special short domain matcher * * @staticvar string $regexp * @return string */ - public static function getRtlCharsMatcher() + public static function getValidSpecialShortDomainMatcher() { static $regexp = null; if ($regexp === null) { - $regexp = '/[' . static::$rtlChars . ']/iu'; + $regexp = '/\A' . static::getValidDomainName() . static::$validSpecialCcTLD . '\Z/iu'; + } + + return $regexp; + } + + /** + * Get invalid url without protocol preceding chars matcher + * + * @staticvar string $regexp + * @return string + */ + public static function getInvalidUrlWithoutProtocolPrecedingCharsMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $regexp = '/[\-_.\/]\z/iu'; } return $regexp; } + /** + * Get valid url + * + * @staticvar string $regexp + * @return string + */ + public static function getValidUrlMatcher() + { + static $regexp = null; + + if ($regexp === null) { + $validUrlPrecedingChars = '(?:[^A-Z0-9_@@\$##' . static::$invalidCharacters . ']|^)'; + $validUrlQueryChars = '[a-z0-9!?\*\'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]'; + $validUrlQueryEndingChars = '[a-z0-9_&=#\/\-]'; + $validPortNumber = '[0-9]+'; + + $regexp = '/(?:' # $1 Complete match (preg_match() already matches everything.) + . '(' . $validUrlPrecedingChars . ')' # $2 Preceding characters + . '(' # $3 Complete URL + . '(https?:\/\/)?' # $4 Protocol (optional) + . '(' . static::getValidDomain() . ')' # $5 Domain(s) + . '(?::(' . $validPortNumber . '))?' # $6 Port number (optional) + . '(\/' . static::getValidUrlPath() . '*)?' # $7 URL Path + . '(\?' . $validUrlQueryChars . '*' . $validUrlQueryEndingChars . ')?' # $8 Query String + . ')' + . ')/iux'; + } + + return $regexp; + } + + /** + * Get domain valid chars + * + * @return string + */ + private static function getDomainValidChars() + { + return '0-9a-z' . 
static::$latinAccents; + } + + /** + * Get valid subdomain + * + * @return string + */ + private static function getValidSubdomain() + { + $domainValidChars = static::getDomainValidChars(); + + return '(?>(?:[' . $domainValidChars . '][' . $domainValidChars . '\-_]*)?[' . $domainValidChars . ']\.)'; + } + + /** + * Get valid domain name + * + * @return string + */ + private static function getValidDomainName() + { + $domainValidChars = static::getDomainValidChars(); + + return '(?:(?:[' . $domainValidChars . '][' . $domainValidChars . '\-]*)?[' . $domainValidChars . ']\.)'; + } + + /** + * Get domain valid unicode chars + * + * @return string + */ + private static function getDomainValidUnicodeChars() + { + return '[^\p{P}\p{Z}\p{C}' . static::$invalidCharacters . static::$spaces . ']'; + } + + /** + * Get valid domain + * + * @return string + */ + private static function getValidDomain() + { + $validSubdomain = static::getValidSubdomain(); + $validDomainName = static::getValidDomainName(); + $domainValidUnicodeChars = static::getDomainValidUnicodeChars(); + $validGTLD = TldLists::getValidGTLD(); + $validCcTLD = TldLists::getValidCcTLD(); + + return '' + // subdomains + domain + TLD + // e.g. www.twitter.com, foo.co.jp, bar.co.uk + . '(?:' . $validSubdomain . '+' . $validDomainName + . '(?:' . $validGTLD . '|' . $validCcTLD . '|' . static::$validPunycode . '))' + // domain + gTLD | protocol + unicode domain + gTLD + . '|(?:' + . '(?:' + . $validDomainName . '|(?:(?<=http:\/\/|https:\/\/)' . $domainValidUnicodeChars . '+\.)' + . ')' + . $validGTLD + . ')' + // domain + gTLD | some ccTLD + // e.g. twitter.com + . '|(?:' . $validDomainName . static::$validPunycode . ')' + . '|(?:' . $validDomainName . static::$validSpecialCcTLD . ')' + // protocol + domain + ccTLD | protocol + unicode domain + ccTLD + . '|(?:(?<=http:\/\/|https:\/\/)' + . '(?:' . $validDomainName . '|' . $domainValidUnicodeChars . '+\.)' + . $validCcTLD . ')' + // domain + ccTLD + '/' + // e.g. 
t.co/ + . '|(?:' . $validDomainName . $validCcTLD . '(?=\/))'; + } + + /** + * Get valid url path + * + * @return string + */ + private static function getValidUrlPath() + { + $validGeneralUrlPathChars = '[a-z\p{Cyrillic}0-9!\*;:=\+\,\.\$\/%#\[\]\-_~&|@' . static::$latinAccents . ']'; + + # Allow URL paths to contain up to two nested levels of balanced parentheses: + # 1. Used in Wikipedia URLs, e.g. /Primer_(film) + # 2. Used in IIS sessions, e.g. /S(dfd346)/ + # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/ + $validUrlBalancedParens = '(?:\(' + . '(?:' . $validGeneralUrlPathChars . '+' + . '|' + // allow one nested level of balanced parentheses + . '(?:' + . $validGeneralUrlPathChars . '*' + . '\(' . $validGeneralUrlPathChars . '+' . '\)' + . $validGeneralUrlPathChars . '*' + . ')' + . ')' + . '\))'; + # Valid end-of-path characters (so /foo. does not gobble the period). + # 1. Allow =&# for empty URL parameters and other URL-join artifacts. + $validUrlPathEndingChars = '[a-z\p{Cyrillic}0-9=_#\/\+\-' . static::$latinAccents . ']|(?:' . $validUrlBalancedParens . ')'; + + return '(?:(?:' + . $validGeneralUrlPathChars . '*(?:' + . $validUrlBalancedParens . ' ' + . $validGeneralUrlPathChars . '*)*' + . $validUrlPathEndingChars . ')|(?:@' + . $validGeneralUrlPathChars . 
'+\/))'; + } + // ================================================================================================================= # NOTE: PHP doesn't have Ruby's $' (dollar apostrophe) so we have to capture @@ -681,6 +816,3 @@ public static function getValidateUrlFragmentMatcher() return $regexp; } } - -# Cause regular expressions to be initialized as soon as this file is loaded: -Regex::__static(); From 320d905c6fabaa9ed2bc0a5592d21b9280cae303 Mon Sep 17 00:00:00 2001 From: nojimage Date: Thu, 28 Dec 2017 19:03:18 +0900 Subject: [PATCH 09/10] drop Regex extends --- lib/Twitter/Text/Autolink.php | 16 ++++++++++++---- lib/Twitter/Text/Extractor.php | 11 +++++++++-- lib/Twitter/Text/HitHighlighter.php | 15 +++++++++++---- lib/Twitter/Text/Validator.php | 11 +++++++++-- 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/lib/Twitter/Text/Autolink.php b/lib/Twitter/Text/Autolink.php index b25fe49..6584718 100644 --- a/lib/Twitter/Text/Autolink.php +++ b/lib/Twitter/Text/Autolink.php @@ -30,7 +30,7 @@ * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0 * @package Twitter.Text */ -class Autolink extends Regex +class Autolink { /** @@ -139,6 +139,13 @@ class Autolink extends Regex */ protected $extractor = null; + /** + * The tweet to be used in parsing. + * + * @var string + */ + protected $tweet = ''; + /** * Provides fluent method chaining. 
* @@ -170,13 +177,14 @@ public function __construct($tweet = null, $escape = true, $full_encode = false) { if ($escape && !empty($tweet)) { if ($full_encode) { - parent::__construct(htmlentities($tweet, ENT_QUOTES, 'UTF-8', false)); + $this->tweet = htmlentities($tweet, ENT_QUOTES, 'UTF-8', false); } else { - parent::__construct(htmlspecialchars($tweet, ENT_QUOTES, 'UTF-8', false)); + $this->tweet = htmlspecialchars($tweet, ENT_QUOTES, 'UTF-8', false); } } else { - parent::__construct($tweet); + $this->tweet = $tweet; } + $this->extractor = Extractor::create(); } diff --git a/lib/Twitter/Text/Extractor.php b/lib/Twitter/Text/Extractor.php index 963d7d2..1c6da07 100644 --- a/lib/Twitter/Text/Extractor.php +++ b/lib/Twitter/Text/Extractor.php @@ -29,7 +29,7 @@ * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0 * @package Twitter.Text */ -class Extractor extends Regex +class Extractor { /** @@ -37,6 +37,13 @@ class Extractor extends Regex */ protected $extractURLWithoutProtocol = true; + /** + * The tweet to be used in parsing. + * + * @var string + */ + protected $tweet = ''; + /** * Provides fluent method chaining. * @@ -60,7 +67,7 @@ public static function create($tweet = null) */ public function __construct($tweet = null) { - parent::__construct($tweet); + $this->tweet = $tweet; } /** diff --git a/lib/Twitter/Text/HitHighlighter.php b/lib/Twitter/Text/HitHighlighter.php index 0ed0387..2697592 100644 --- a/lib/Twitter/Text/HitHighlighter.php +++ b/lib/Twitter/Text/HitHighlighter.php @@ -27,7 +27,7 @@ * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0 * @package Twitter.Text */ -class HitHighlighter extends Regex +class HitHighlighter { /** @@ -37,6 +37,13 @@ class HitHighlighter extends Regex */ protected $tag = 'em'; + /** + * The tweet to be used in parsing. + * + * @var string + */ + protected $tweet = ''; + /** * Provides fluent method chaining. 
* @@ -67,12 +74,12 @@ public function __construct($tweet = null, $escape = true, $full_encode = false) { if (!empty($tweet) && $escape) { if ($full_encode) { - parent::__construct(htmlentities($tweet, ENT_QUOTES, 'UTF-8', false)); + $this->tweet = htmlentities($tweet, ENT_QUOTES, 'UTF-8', false); } else { - parent::__construct(htmlspecialchars($tweet, ENT_QUOTES, 'UTF-8', false)); + $this->tweet = htmlspecialchars($tweet, ENT_QUOTES, 'UTF-8', false); } } else { - parent::__construct($tweet); + $this->tweet = $tweet; } } diff --git a/lib/Twitter/Text/Validator.php b/lib/Twitter/Text/Validator.php index dbe79ee..c53ae9c 100644 --- a/lib/Twitter/Text/Validator.php +++ b/lib/Twitter/Text/Validator.php @@ -27,7 +27,7 @@ * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0 * @package Twitter.Text */ -class Validator extends Regex +class Validator { /** @@ -57,6 +57,13 @@ class Validator extends Regex */ protected $extractor = null; + /** + * The tweet to be used in parsing. + * + * @var string + */ + protected $tweet = ''; + /** * Provides fluent method chaining. * @@ -79,10 +86,10 @@ public static function create($tweet = null, $config = null) */ public function __construct($tweet = null, $config = null) { - parent::__construct($tweet); if (!empty($config)) { $this->setConfiguration($config); } + $this->tweet = $tweet; $this->extractor = Extractor::create(); } From c763596ee45123cba643c4d31221ce0c87ce921c Mon Sep 17 00:00:00 2001 From: nojimage Date: Thu, 28 Dec 2017 19:08:27 +0900 Subject: [PATCH 10/10] add test PHP7.2 on travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 7acd194..2330481 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ php: - 5.6 - 7.0 - 7.1 + - 7.2 dist: trusty sudo: false