Skip to content

Make a "tool" which is useful for checking links in a directory for accessibility #73

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,16 @@ repositories {
// Dependency declarations; every `libs.*` accessor refers to an entry of the version catalog (gradle/libs.versions.toml).
dependencies {
implementation(libs.jackson.databind)
implementation(libs.mockneat)
// Logging API bundle: slf4j-api + kotlin-logging (see the `logger-api` bundle in the version catalog).
implementation(libs.bundles.logger.api)
// HTTP client (com.squareup.okhttp3:okhttp).
implementation(libs.okhttp.core)

// Logging backend (logback-classic); declared runtime-only, so it is not on the compile classpath.
runtimeOnly(libs.logger.impl)

testImplementation(kotlin("test"))
testImplementation(platform(libs.junit))
testImplementation("org.junit.jupiter:junit-jupiter-params")
testImplementation(libs.assertj)
// Mock interceptor for OkHttp (com.github.gmazzo:okhttp-mock), test scope only.
testImplementation(libs.okhttp.mock)

jmhAnnotationProcessor(libs.jmh.ann)
}
Expand Down
6 changes: 6 additions & 0 deletions gradle/libs.versions.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,14 @@ junit = { module = "org.junit:junit-bom", version.ref = "junit" }
assertj = { module = "org.assertj:assertj-core", version = "3.23.1" }
mockneat = { module = "net.andreinc:mockneat", version = "0.4.8" }
jmh-ann = { module = "org.openjdk.jmh:jmh-generator-annprocess", version.ref = "jmh.tools" }
logger-api4j = { module = "org.slf4j:slf4j-api", version = "2.0.6" }
logger-api4k = { module = "io.github.microutils:kotlin-logging-jvm", version = "3.0.4" }
logger-impl = { module = "ch.qos.logback:logback-classic", version = "1.4.5" }
okhttp-core = { module = "com.squareup.okhttp3:okhttp", version = "4.10.0" }
okhttp-mock = { module = "com.github.gmazzo:okhttp-mock", version = "1.5.0" }

[bundles]
logger-api = ["logger.api4j", "logger.api4k"]

[plugins]
jmh = { id = "me.champeau.jmh", version = "0.6.8" }
Expand Down
169 changes: 169 additions & 0 deletions src/main/kotlin/name/valery1707/problem/LinkChecker.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
package name.valery1707.problem

import okhttp3.Headers
import okhttp3.OkHttpClient
import okhttp3.Request
import java.net.URI
import java.nio.file.Path
import java.time.Duration
import java.time.Instant
import java.time.format.DateTimeFormatter
import java.time.temporal.ChronoField.NANO_OF_SECOND
import kotlin.io.path.ExperimentalPathApi
import kotlin.io.path.PathWalkOption
import kotlin.io.path.readText
import kotlin.io.path.walk

/**
 * Scans the files below [root], extracts `http`/`https` links from their text and checks
 * every link with an HTTP GET request.
 *
 * todo Make async (probably with coroutines)
 */
class LinkChecker(private val root: Path) {
    /**
     * Scans all files of the directory, looks for links in their text and checks them for availability.
     *
     * @param client HTTP client used to execute the requests.
     * @param limit maximum number of links to check; defaults to the cap of 20 that used to be hard-coded
     * (todo Remove limit). Must not be negative.
     * @return the links whose check did not end with status `200`: the key is `<relative file>:<line>:<column>`,
     * the value is `<uri> -> <code>` followed by ` -> <redirect target>` for redirects
     * or ` -> message=<error message>` when the request itself failed (code `-1`).
     * @throws IllegalArgumentException when [limit] is negative.
     */
    @OptIn(ExperimentalPathApi::class)
    fun findInvalid(client: OkHttpClient, limit: Int = 20): Map<String, String> {
        val filePos2uriCheck = root
            .walk(PathWalkOption.FOLLOW_LINKS)
            .flatMap { file ->
                val relative = root.relativize(file)
                loadFile(file).findUri().map { (position, uri) -> (relative to position) to uri }
            }
            .take(limit)
            .map { (filePos, uri) -> filePos to (uri to uri.check(client)) }
            .filter { (_, uriCheck) -> uriCheck.second.first != 200 }
            .toList()
        // todo remove
        logger.debug { "filePos2uriCheck = $filePos2uriCheck" }
        return filePos2uriCheck.associate { (filePos, uriCheck) ->
            val (file, position) = filePos
            val (uri, result) = uriCheck
            val (code, target) = result
            "$file:$position" to when (code) {
                in HTTP_REDIRECT -> "$uri -> $code -> $target"
                CHECK_FAILED -> "$uri -> $code -> ${target.query}"
                else -> "$uri -> $code"
            }
        }
    }

    /** Reads the whole file as text. */
    private fun loadFile(path: Path): String = path.readText()

    companion object {
        private val logger = mu.KotlinLogging.logger {}

        /** Pseudo status code reported when the request could not be executed at all. */
        private const val CHECK_FAILED = -1

        /** How many times a rate-limited request is repeated before its status code is reported as is. */
        private const val MAX_RATE_LIMIT_RETRIES = 5

        /** Pause before a retry when the response carries no usable rate-limit header. */
        private const val DEFAULT_AWAIT_MILLIS = 500L

        private val HTTP_SCHEMES = setOf("http", "https")
        private val HTTP_REDIRECT = setOf(301, 302, 307, 308)
        private val HTTP_RATE_LIMIT = setOf(403, 429)

        /**
         * https://stackoverflow.com/a/45690571
         */
        private val URI_PATTERN_FULL = ("" +
            "(?<scheme>[a-z][a-z0-9+.-]+):" +
            "(?<authority>\\/\\/(?<user>[^@]+@)?(?<host>[a-z0-9.\\-_~]+)(?<port>:\\d+)?)?" +
            "(?<path>(?:[a-z0-9-._~]|%[a-f0-9]|[!\$&'()*+,;=:@])+(?:\\/(?:[a-z0-9-._~]|%[a-f0-9]|[!\$&'()*+,;=:@])*)*|(?:\\/(?:[a-z0-9-._~]|%[a-f0-9]|[!\$&'()*+,;=:@])+)*)?" +
            "(?<query>\\?(?:[a-z0-9-._~]|%[a-f0-9]|[!\$&'()*+,;=:@]|[/?])+)?" +
            "(?<fragment>\\#(?:[a-z0-9-._~]|%[a-f0-9]|[!\$&'()*+,;=:@]|[/?])+)?" +
            "").toRegex(RegexOption.IGNORE_CASE)

        /**
         * Variant of [URI_PATTERN_FULL] derived by textual substitutions on the pattern source:
         * the literal `()` is removed (it occurs inside the character classes, so parentheses stop being
         * accepted as URI characters), `?:` is removed (groups become capturing), the quantifier in front
         * of the query group is relaxed and the user-info part is restricted to word characters.
         */
        private val URI_PATTERN_SIMPLE = URI_PATTERN_FULL.pattern
            .replace("()", "")
            .replace("?:", "")
            .replace("+)*)?(?<query>", "*)*)?(?<query>")
            .replace("(?<user>[^@]+@)", "(?<user>[\\w]+@)")
            .toRegex(RegexOption.IGNORE_CASE)

        /**
         * Position of the match start inside [text] as `line:column`, both 1-based.
         */
        private fun MatchResult.position(text: String): String {
            val prefix = text.subSequence(0, range.first)
            // indexOfLast yields -1 on the first line, which makes the column 1-based there as well
            val col = range.first - prefix.indexOfLast { it == '\n' }
            val line = 1 + prefix.count { it == '\n' }
            return "$line:$col"
        }

        /**
         * Finds all `http`/`https` links in the receiver.
         *
         * Trailing dots are cut off the match (sentence punctuation); matches that are not parsable
         * as [URI] are skipped.
         *
         * @return lazy sequence of `position to uri`, see [position] for the position format.
         */
        private fun String.findUri(): Sequence<Pair<String, URI>> = URI_PATTERN_SIMPLE
            .findAll(this)
            .filter { it.value.startsWith("http") }
            .mapNotNull { match ->
                match.value.trimEnd('.').toURI()?.let { uri -> match.position(this) to uri }
            }
            .filter { it.second.scheme in HTTP_SCHEMES }

        /** Parses the receiver as [URI]; returns `null` when the string is not a valid URI. */
        internal fun String.toURI(): URI? = try {
            URI.create(this)
        } catch (e: IllegalArgumentException) {
            null
        }

        /**
         * Executes a GET request for the receiver and classifies the answer.
         *
         * * redirect ([HTTP_REDIRECT]): the code and the `Location` target resolved against the receiver;
         * * rate limit ([HTTP_RATE_LIMIT]): waits (see [rateLimitAwait]) and repeats the request,
         *   at most [retriesLeft] times; afterwards the code is reported like any other one;
         * * anything else: the code and the URL of the request that produced the response;
         * * failure of the request itself: [CHECK_FAILED] and a synthetic URI, see [failure].
         *
         * @param retriesLeft number of rate-limit retries still allowed for this link.
         */
        private fun URI.check(client: OkHttpClient, retriesLeft: Int = MAX_RATE_LIMIT_RETRIES): Pair<Int, URI> {
            // todo Cache
            val awaitMillis = try {
                logger.info("Check: $this")
                // Built inside the try block: a URI that can not be turned into a request is a failed check, not a crash
                val request = Request.Builder().url(this.toURL()).get().build()
                client.newCall(request).execute().use { response ->
                    val code = response.code
                    when {
                        //Redirects: extract new location
                        code in HTTP_REDIRECT -> {
                            val location = response.header("Location")
                                ?: error("Redirect $code without Location header")
                            // Location may be relative, so it is resolved against the requested URI
                            return code to resolve(location)
                        }

                        //Rate limiting: compute the pause here, wait and retry after the response is closed
                        code in HTTP_RATE_LIMIT && retriesLeft > 0 ->
                            response.headers.rateLimitAwait(Instant.now()) ?: DEFAULT_AWAIT_MILLIS

                        else -> return code to response.request.url.toUri()
                    }
                }
            } catch (e: Exception) {
                logger.error(e) { "Handle error on checking $this" }
                return failure(e)
            }

            logger.debug("Await: $awaitMillis ms")
            try {
                Thread.sleep(awaitMillis)
            } catch (e: InterruptedException) {
                // Keep the interruption visible for the caller instead of swallowing it
                Thread.currentThread().interrupt()
                logger.error(e) { "Handle error on checking $this" }
                return failure(e)
            }
            return check(client, retriesLeft - 1)
        }

        /**
         * Result of a check that ended with an exception: [CHECK_FAILED] plus a synthetic URI whose query
         * is `message=<exception message>`.
         *
         * The multi-argument [URI] constructor quotes illegal characters, so any message text is accepted
         * and `URI.query` returns it unchanged.
         */
        private fun failure(e: Exception): Pair<Int, URI> =
            CHECK_FAILED to URI("http", "host", "", "message=${e.message}", null)

        /**
         * Looks for a rate-limit header known to [HTTP_RATE_LIMIT_EXTRACTORS].
         *
         * @return the first non-negative pause in milliseconds, or `null` when there is none.
         */
        private fun Headers.rateLimitAwait(now: Instant): Long? = HTTP_RATE_LIMIT_EXTRACTORS
            .asSequence()
            .flatMap { (header, extractor) ->
                values(header).asSequence().mapNotNull { extractor(it.trim(), now) }
            }
            .firstOrNull { it >= 0 }

        /**
         * Header name to a function `(header value, now) -> pause in milliseconds`;
         * a function returns `null` when it can not interpret the value.
         */
        private val HTTP_RATE_LIMIT_EXTRACTORS: Map<String, (String, Instant) -> Long?> = mapOf(
            // https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28#checking-your-rate-limit-status
            "x-ratelimit-reset" to { value: String, now: Instant ->
                value.toLongOrNull()?.let { epochSecond -> millisUntil(Instant.ofEpochSecond(epochSecond), now) }
            },
            // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After
            "Retry-After" to { value: String, now: Instant ->
                // "delay-seconds" variant is expressed in seconds, the callers expect milliseconds
                value.toLongOrNull()?.let { seconds -> Duration.ofSeconds(seconds).toMillis() }
                    // "http-date" variant
                    ?: parseHttpDate(value)?.let { date -> millisUntil(date, now) }
            },
        )

        /** Milliseconds from [now] (truncated to whole seconds) until [target]; negative when [target] is in the past. */
        private fun millisUntil(target: Instant, now: Instant): Long =
            Duration.between(now.with(NANO_OF_SECOND, 0), target).toMillis()

        /** Parses [value] with [HTTP_DATE_FORMAT]; returns `null` when the value is not such a date. */
        private fun parseHttpDate(value: String): Instant? = try {
            HTTP_DATE_FORMAT.parse(value, Instant::from)
        } catch (e: java.time.format.DateTimeParseException) {
            null
        }

        /**
         * @see <a href="https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Date">Specification</a>
         */
        internal val HTTP_DATE_FORMAT = DateTimeFormatter.RFC_1123_DATE_TIME
    }
}
145 changes: 145 additions & 0 deletions src/test/kotlin/name/valery1707/problem/LinkCheckerTest.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
package name.valery1707.problem

import name.valery1707.problem.LinkChecker.Companion.HTTP_DATE_FORMAT
import name.valery1707.problem.LinkChecker.Companion.toURI
import okhttp3.Headers.Companion.toHeaders
import okhttp3.OkHttpClient
import okhttp3.mock.MockInterceptor
import okhttp3.mock.body
import org.assertj.core.api.Assertions.assertThat
import org.assertj.core.api.Assertions.fail
import org.assertj.core.api.Assumptions.assumeThat
import org.junit.jupiter.api.Test
import org.junit.jupiter.params.ParameterizedTest
import org.junit.jupiter.params.provider.ValueSource
import java.net.InetSocketAddress
import java.net.ProxySelector
import java.nio.file.Path
import java.time.Instant
import java.time.ZoneId
import java.util.*
import kotlin.io.path.toPath

/**
 * Lazily evaluated description of a mocked HTTP response: invoking it yields the status code
 * and the response headers (name to value).
 */
typealias ResponseMeta = () -> Pair<Int, Map<String, String>>

/**
 * Tests of [LinkChecker]: a run against a real directory (skipped when the directory is absent),
 * a run against the bundled `Demo.md` with mocked HTTP responses and a check of the URI parsing helper.
 */
internal class LinkCheckerTest {

    /**
     * Proxy used by [checkReal]: the first candidate whose host name resolves,
     * otherwise the default selector of the JVM.
     */
    private val proxy: ProxySelector by lazy {
        sequenceOf("genproxy" to 8080)
            .map { (host, port) -> InetSocketAddress(host, port) }
            .firstOrNull { !it.isUnresolved }
            ?.let { ProxySelector.of(it) }
            ?: ProxySelector.getDefault()
    }

    /** Checks the links of a real project with real HTTP requests; skipped unless [path] is a readable directory. */
    @ParameterizedTest
    @ValueSource(strings = ["./path/to/real/project"])
    internal fun checkReal(path: Path) {
        assumeThat(path).isDirectory.isReadable

        val client = OkHttpClient.Builder()
            .followRedirects(false)
            .followSslRedirects(false)
            .proxySelector(proxy)
            .build()

        val invalid = LinkChecker(path).findInvalid(client)
        assertThat(invalid).isEmpty()
    }

    /** Runs the checker over the directory of `Demo.md` with a scripted answer queue for every link. */
    @Test
    @Suppress("HttpUrlsUsage")
    internal fun testDemo() {
        val path = javaClass.getResource("/linkChecker/Demo.md")?.toURI()?.toPath()?.parent
        assertThat(path).isNotNull.isDirectory.isReadable

        // Factories of scripted answers; headers depending on the clock are computed when the answer is served
        fun ok(): ResponseMeta = { 200 to mapOf() }
        fun notFound(): ResponseMeta = { 404 to mapOf() }
        fun redirect(code: Int, target: String): ResponseMeta = { code to mapOf("Location" to target) }
        fun rateLimitGH(awaitMillis: Long): ResponseMeta = {
            val reset = Instant.now().plusMillis(awaitMillis).epochSecond
            403 to mapOf("x-ratelimit-reset" to reset.toString())
        }

        fun rateLimitSpecSec(awaitSec: Int): ResponseMeta = { 429 to mapOf("Retry-After" to awaitSec.toString()) }
        fun rateLimitSpecDate(awaitMillis: Long): ResponseMeta = {
            val date = Instant.now().plusMillis(awaitMillis).atZone(ZoneId.systemDefault())
            429 to mapOf("Retry-After" to HTTP_DATE_FORMAT.format(date))
        }

        //Check links via: curl --silent -X GET --head 'URL'
        val responses: Map<String, MutableList<ResponseMeta>> = mapOf(
            "https://habr.com/ru/company/otus/blog/707724/comments" to mutableListOf(
                redirect(302, "https://habr.com/ru/company/otus/blog/707724/comments/"),
            ),
            "https://habr.com/ru/company/otus/blog/707724/comments/" to mutableListOf(
                ok(),
            ),
            "http://schema.org/" to mutableListOf(
                redirect(301, "https://schema.org/"),
            ),
            "https://github.com/androidx/androidx/blob/androidx-main/build.gradle" to mutableListOf(
                //Will wait some time
                rateLimitGH(2111),
                //Will wait zero time
                rateLimitGH(10),
                //Will wait default time
                rateLimitGH(-1500),
                ok(),
            ),
            "https://www.bearer.com/" to mutableListOf(
                // Use variant with "delay-seconds"
                rateLimitSpecSec(1),
                // Use variant with "http-date"
                rateLimitSpecDate(100),
                ok(),
            ),
            "https://github.com/androidx/androidx/blob/androidx-main/buildSrc/public/src/main/kotlin/androidx/build/LibraryGroups.kt" to mutableListOf(
                notFound(),
            ),
        )
        val client = mockHttpClient(responses)

        val invalid = LinkChecker(path!!).findInvalid(client)

        assertThat(invalid).containsExactlyInAnyOrderEntriesOf(
            mapOf(
                "Demo.md:1:25" to "https://habr.com/ru/company/otus/blog/707724/comments -> 302 -> https://habr.com/ru/company/otus/blog/707724/comments/",
                "Demo.md:3:14" to "http://schema.org -> 301 -> https://schema.org/",
                "Demo.md:7:14" to "https://github.com/androidx/androidx/blob/androidx-main/buildSrc/public/src/main/kotlin/androidx/build/LibraryGroups.kt -> 404",
            ),
        )
    }

    /** Strings that are not valid URIs are converted to `null`. */
    @ParameterizedTest
    @ValueSource(strings = ["some invalid uri"])
    internal fun testInvalidUriString(uriString: String) {
        val parsed = uriString.toURI()
        assertThat(parsed).isNull()
    }

    private companion object {
        /**
         * Builds a client that never touches the network: each request takes the next scripted answer
         * queued for its URI in [responses]; an unknown URI or an exhausted queue fails the test.
         */
        fun mockHttpClient(responses: Map<String, MutableList<ResponseMeta>>): OkHttpClient {
            val interceptor = MockInterceptor()

            interceptor.addRule()
                .anyTimes()
                .answer { req ->
                    val uri = req.url.toUri()
                    val queue: MutableList<ResponseMeta> = responses[uri.toString()] ?: fail("Unknown URI: $uri")
                    val next: ResponseMeta = queue.removeFirstOrNull() ?: fail("Too many requests for URI: $uri")
                    val (code, headers) = next()
                    okhttp3.Response.Builder()
                        .code(code)
                        .headers(headers.toHeaders())
                        .body("")
                }

            return OkHttpClient.Builder()
                .addInterceptor(interceptor)
                .build()
        }
    }

}
7 changes: 7 additions & 0 deletions src/test/resources/linkChecker/Demo.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Link with name: [named](https://habr.com/ru/company/otus/blog/707724/comments).
Link with name: [named](https://habr.com/ru/company/otus/blog/707724/comments/).
Link inlined http://schema.org.
Link with rate limiting:
* https://github.com/androidx/androidx/blob/androidx-main/build.gradle
* https://www.bearer.com/
Link absent: https://github.com/androidx/androidx/blob/androidx-main/buildSrc/public/src/main/kotlin/androidx/build/LibraryGroups.kt
8 changes: 6 additions & 2 deletions versions.lock
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
# Run ./gradlew --write-locks to regenerate this file
ch.qos.logback:logback-classic:1.4.5 (1 constraints: 0c050136)
ch.qos.logback:logback-core:1.4.5 (1 constraints: 0b0d071d)
com.fasterxml.jackson:jackson-bom:2.14.1 (3 constraints: 7f37c0d3)
com.fasterxml.jackson.core:jackson-annotations:2.14.1 (2 constraints: 3f21f093)
com.fasterxml.jackson.core:jackson-core:2.14.1 (2 constraints: 3f21f093)
com.fasterxml.jackson.core:jackson-databind:2.14.1 (2 constraints: f1138582)
com.github.mifmif:generex:1.0.2 (1 constraints: e00921ac)
commons-codec:commons-codec:1.15 (1 constraints: b6094aa2)
dk.brics.automaton:automaton:1.11-8 (1 constraints: 900be8e7)
io.github.microutils:kotlin-logging-jvm:3.0.4 (1 constraints: 0905fe35)
net.andreinc:aleph:0.1.1 (1 constraints: df091eac)
net.andreinc:markovneat:1.8 (1 constraints: 88099b98)
net.andreinc:mockneat:0.4.8 (1 constraints: 0e05ff35)
org.apache.commons:commons-lang3:3.12.0 (2 constraints: ab17f565)
org.apache.commons:commons-text:1.9 (1 constraints: 89099c98)
org.jetbrains:annotations:13.0 (1 constraints: df0e795c)
org.jetbrains.kotlin:kotlin-stdlib:1.7.22 (3 constraints: 463049c4)
org.jetbrains.kotlin:kotlin-stdlib-common:1.7.22 (1 constraints: 450fd27a)
org.jetbrains.kotlin:kotlin-stdlib-common:1.7.22 (2 constraints: 71207b44)
org.jetbrains.kotlin:kotlin-stdlib-jdk7:1.7.22 (1 constraints: e310fbd2)
org.jetbrains.kotlin:kotlin-stdlib-jdk8:1.7.22 (1 constraints: 3e05453b)
org.jetbrains.kotlin:kotlin-stdlib-jdk8:1.7.22 (2 constraints: 6a16ed11)
org.slf4j:slf4j-api:2.0.6 (3 constraints: 0723153f)

[Test dependencies]
net.bytebuddy:byte-buddy:1.12.10 (1 constraints: 7b0bbcea)
Expand Down