This repository has been archived by the owner on Jan 9, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.php
102 lines (75 loc) · 2.59 KB
/
utils.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
<?php
// File-name prefix handed to tempnam() by saveEntriesToSqlite().
// The constant name must be passed as a quoted string: a bare SCRAPPR_PREFIX is
// itself a lookup of a not-yet-defined constant (warning on PHP 7, Error on PHP 8).
define('SCRAPPR_PREFIX', 'scrappr_');
/**
 * Removes paired HTML elements together with everything between their tags.
 *
 * Only elements written as <name ...>...</name> are matched; unpaired or
 * self-closing tags are left untouched.
 *
 * - $invert false (default): every paired element is removed except those whose
 *   tag name is listed in $tags. With an empty $tags all paired elements go.
 * - $invert true: only the elements listed in $tags are removed. With an empty
 *   $tags the text is returned unchanged.
 *
 * @param string $text   markup to clean
 * @param string $tags   tag list written as markup, e.g. '<b><i>'
 * @param bool   $invert switches between "keep listed" (false) and "remove listed" (true)
 * @return string|null cleaned text; null if preg_replace() reports an error
 */
function strip_tags_content($text, $tags = '', $invert = FALSE) {
    preg_match_all('/<(.+?)[\s]*\/?[\s]*>/si', trim($tags), $matches);

    // Quote every tag name before it is spliced into the patterns below, so a
    // name containing the '@' delimiter or a regex metacharacter cannot break
    // or alter the expression.
    $names = array();
    foreach (array_unique($matches[1]) as $name) {
        $names[] = preg_quote($name, '@');
    }

    if (count($names) > 0) {
        $alternation = implode('|', $names);
        if (!$invert) {
            // Negative lookahead skips the listed tags; everything else is removed.
            return preg_replace('@<(?!(?:' . $alternation . ')\b)(\w+)\b.*?>.*?</\1>@si', '', $text);
        }
        return preg_replace('@<(' . $alternation . ')\b.*?>.*?</\1>@si', '', $text);
    }

    if (!$invert) {
        return preg_replace('@<(\w+)\b.*?>.*?</\1>@si', '', $text);
    }

    return $text;
}
/**
 * Collects text entries from a sequence of numbered pages.
 *
 * Each page URL is produced by sprintf($format, $pageId). The text content of
 * every element matching $selector is cleaned, trimmed and kept when it is at
 * least $minEntryLength bytes long. Paging stops when the current page has no
 * link to the computed next-page URL (absolute or relative form), or when the
 * next page number would exceed a non-zero $limit.
 *
 * @param string $format         sprintf() format that turns a page number into a URL
 * @param string $selector       phpQuery selector matching the entry elements
 * @param int    $increase       multiplier applied to the page number when building the expected next-page link
 * @param int    $start          number of the first page to fetch
 * @param int    $limit          highest page number to fetch (the first page is always fetched); 0 means no limit
 * @param int    $minEntryLength minimum strlen() an entry must have to be kept
 * @return array list of entry strings
 */
function fetchEntries($format, $selector, $increase = 1, $start = 1, $limit = 0, $minEntryLength = 0) {
    // Quoted and guarded: a bare constant name is an Error on PHP 8, and an
    // unguarded define() complains on every call after the first.
    // NOTE(review): MAX_LIMIT is not read anywhere in this function.
    if (!defined('MAX_LIMIT')) {
        define('MAX_LIMIT', 1000);
    }

    $entries = array();
    $pageId = $start;

    while (true) {
        $docUrl = sprintf($format, $pageId);
        // Loading the document makes it the one that the pq() calls below query.
        phpQuery::newDocumentFileHTML($docUrl);

        foreach (pq($selector) as $entry) {
            $tmp = preg_replace('/<!--(.*)-->/Uis', '', $entry->textContent); // strip html comments
            $tmp = trim(strip_tags_content($tmp)); // remove html tags with contents
            if (strlen($tmp) >= $minEntryLength) {
                $entries[] = $tmp;
            }
        }

        $pageId++;
        // NOTE(review): the page fetched above uses $pageId alone while the link
        // looked up here uses $pageId * $increase; the two only agree when
        // $increase is 1. Confirm the intended paging scheme before changing it.
        $absoluteNextPageUrl = sprintf($format, $pageId * $increase);

        // For pages with <base href="...">: also look for the link with the
        // origin stripped. The origin is taken from the parsed URL so that
        // https and non-default ports work too, not only "http://host/".
        $relativeNextPageUrl = $absoluteNextPageUrl;
        $urlComponents = parse_url($docUrl);
        if (is_array($urlComponents) && isset($urlComponents['scheme'], $urlComponents['host'])) {
            $origin = $urlComponents['scheme'] . '://' . $urlComponents['host'];
            if (isset($urlComponents['port'])) {
                $origin .= ':' . $urlComponents['port'];
            }
            $relativeNextPageUrl = str_replace($origin . '/', '', $absoluteNextPageUrl);
        }

        $nextPageElement = pq(sprintf('a[href="%s"], a[href="%s"]', $absoluteNextPageUrl, $relativeNextPageUrl));
        if (count($nextPageElement) == 0) {
            break;
        }

        if (($limit != 0) && ($pageId > $limit)) {
            break;
        }
    }

    return $entries;
}
/**
 * Saves given entries in a temporary sqlite database.
 *
 * The schema is read from db/schema.sql and the insert statement from
 * db/insert.sql. Each entry is bound to the first placeholder of the insert
 * statement, and all rows are written inside a single transaction. If a
 * database step fails, the transaction is rolled back and the temporary file
 * is removed before the exception is rethrown.
 *
 * @param array $entries values to insert, one row per element
 * @return string path to the db file
 * @throws Exception when a SQL file cannot be read, the tmp file cannot be
 *                   created, or a database operation fails (PDOException)
 */
function saveEntriesToSqlite($entries) {
    // Read both SQL files before creating anything on disk, so a missing file
    // leaves no stray temp file behind.
    $createSyntax = file_get_contents("db/schema.sql");
    if ($createSyntax === false) {
        throw new Exception("Cannot read db/schema.sql");
    }
    $insertSyntax = file_get_contents("db/insert.sql");
    if ($insertSyntax === false) {
        throw new Exception("Cannot read db/insert.sql");
    }

    $tmpFileName = tempnam("./tmp", SCRAPPR_PREFIX);
    if (!$tmpFileName) {
        throw new Exception("Cannot create tmp file");
    }
    $path = realpath($tmpFileName);
    if ($path === false) {
        // Fall back to the name tempnam() reported rather than opening "sqlite:".
        $path = $tmpFileName;
    }

    $pdo = null;
    $stmt = null;
    try {
        $pdo = new PDO("sqlite:" . $path);
        $pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
        $pdo->exec($createSyntax);

        $stmt = $pdo->prepare($insertSyntax);
        $pdo->beginTransaction();
        foreach ($entries as $entry) {
            $stmt->bindValue(1, $entry);
            $stmt->execute();
        }
        $pdo->commit();
    } catch (Exception $e) {
        if ($pdo !== null && $pdo->inTransaction()) {
            $pdo->rollBack();
        }
        // Release the handles before deleting the half-written database file.
        $stmt = null;
        $pdo = null;
        if (is_file($path)) {
            unlink($path);
        }
        throw $e;
    }

    // Drop the statement and connection so the file is closed before the path is handed back.
    $stmt = null;
    $pdo = null;

    return $path;
}
?>