diff --git a/build.gradle.kts b/build.gradle.kts index 1d3367c..f024f9d 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -27,11 +27,16 @@ repositories { dependencies { implementation(libs.jackson.databind) implementation(libs.mockneat) + implementation(libs.bundles.logger.api) + implementation(libs.okhttp.core) + + runtimeOnly(libs.logger.impl) testImplementation(kotlin("test")) testImplementation(platform(libs.junit)) testImplementation("org.junit.jupiter:junit-jupiter-params") testImplementation(libs.assertj) + testImplementation(libs.okhttp.mock) jmhAnnotationProcessor(libs.jmh.ann) } diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 7a3367c..8d70b79 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -11,8 +11,14 @@ junit = { module = "org.junit:junit-bom", version.ref = "junit" } assertj = { module = "org.assertj:assertj-core", version = "3.23.1" } mockneat = { module = "net.andreinc:mockneat", version = "0.4.8" } jmh-ann = { module = "org.openjdk.jmh:jmh-generator-annprocess", version.ref = "jmh.tools" } +logger-api4j = { module = "org.slf4j:slf4j-api", version = "2.0.6" } +logger-api4k = { module = "io.github.microutils:kotlin-logging-jvm", version = "3.0.4" } +logger-impl = { module = "ch.qos.logback:logback-classic", version = "1.4.5" } +okhttp-core = { module = "com.squareup.okhttp3:okhttp", version = "4.10.0" } +okhttp-mock = { module = "com.github.gmazzo:okhttp-mock", version = "1.5.0" } [bundles] +logger-api = ["logger.api4j", "logger.api4k"] [plugins] jmh = { id = "me.champeau.jmh", version = "0.6.8" } diff --git a/src/main/kotlin/name/valery1707/problem/LinkChecker.kt b/src/main/kotlin/name/valery1707/problem/LinkChecker.kt new file mode 100644 index 0000000..527edf7 --- /dev/null +++ b/src/main/kotlin/name/valery1707/problem/LinkChecker.kt @@ -0,0 +1,169 @@ +package name.valery1707.problem + +import okhttp3.Headers +import okhttp3.OkHttpClient +import okhttp3.Request +import java.net.URI +import 
java.nio.file.Path +import java.time.Duration +import java.time.Instant +import java.time.format.DateTimeFormatter +import java.time.temporal.ChronoField.NANO_OF_SECOND +import kotlin.io.path.ExperimentalPathApi +import kotlin.io.path.PathWalkOption +import kotlin.io.path.readText +import kotlin.io.path.walk + +/** + * Scans the files under [root] for `http`/`https` links and reports the links that do not answer with HTTP 200. + * + * todo Add description + * todo Make async (probably with coroutines) + */ +class LinkChecker(private val root: Path) { + /** + * Scans all files in the directory, looks for links in their text and checks that the links are reachable. + * + * Only the first 20 links found are checked for now (see the limit todo in the body). + * + * @param client HTTP client used to request every link + * @return invalid links only: the key is `relative/path:line:column`, the value is `uri -> code`, + * followed by ` -> target` for redirect codes or by ` -> ` plus the query of the error URI when the code is `-1` + */ + @OptIn(ExperimentalPathApi::class) + fun findInvalid(client: OkHttpClient): Map { + val filePos2uriCheck = root + .walk(PathWalkOption.FOLLOW_LINKS) + .map { root.relativize(it) } + .map { + it to loadFile(root.resolve(it)) + } + .flatMap { pathWithText -> + pathWithText.second.findUri() + .map { pathWithText.first to it.first to it.second } + } + .take(20)// todo Remove limit + .map { + it.first to (it.second to it.second.check(client)) + } + .filter { it.second.second.first != 200 } + .toList() + // todo remove + logger.debug { "filePos2uriCheck = $filePos2uriCheck" } + return filePos2uriCheck + .associateBy( + { "${it.first.first}:${it.first.second}" }, + { + when (it.second.second.first) { + in HTTP_REDIRECT -> "${it.second.first} -> ${it.second.second.first} -> ${it.second.second.second}" + -1 -> "${it.second.first} -> ${it.second.second.first} -> ${it.second.second.second.query}" + else -> "${it.second.first} -> ${it.second.second.first}" + } + }, + ) + } + + private fun loadFile(path: Path): String { + return path.readText() + } + + companion object { + private val logger = mu.KotlinLogging.logger {} + + /** + * Full URI pattern; source: https://stackoverflow.com/a/45690571 + */ + private val URI_PATTERN_FULL = ("" + + "(?[a-z][a-z0-9+.-]+):" + + "(?\\/\\/(?[^@]+@)?(?[a-z0-9.\\-_~]+)(?:\\d+)?)?" + + "(?(?:[a-z0-9-._~]|%[a-f0-9]|[!\$&'()*+,;=:@])+(?:\\/(?:[a-z0-9-._~]|%[a-f0-9]|[!\$&'()*+,;=:@])*)*|(?:\\/(?:[a-z0-9-._~]|%[a-f0-9]|[!\$&'()*+,;=:@])+)*)?" 
+ "(?\\?(?:[a-z0-9-._~]|%[a-f0-9]|[!\$&'()*+,;=:@]|[/?])+)?" + + "(?\\#(?:[a-z0-9-._~]|%[a-f0-9]|[!\$&'()*+,;=:@]|[/?])+)?" + + "").toRegex(RegexOption.IGNORE_CASE) + + private val URI_PATTERN_SIMPLE = URI_PATTERN_FULL.pattern + .replace("()", "") + .replace("?:", "") + .replace("+)*)?(?", "*)*)?(?") + .replace("(?[^@]+@)", "(?[\\w]+@)") + .toRegex(RegexOption.IGNORE_CASE) + + /** + * Converts the start of the match into a 1-based `line:column` position inside [text]. + */ + private fun MatchResult.position(text: String): String { + val prefix = text.subSequence(0, range.last) + val col = range.first - prefix.indexOfLast { it == '\n' } + val line = 1 + prefix.count { it == '\n' } + return "$line:$col" + } + + /** + * Finds `http`/`https` links in the text; a trailing dot is dropped and candidates that are not valid URIs are skipped. + * + * @return pairs of `line:column` position and parsed URI + */ + private fun String.findUri() = URI_PATTERN_SIMPLE + .findAll(this) + .filter { it.value.startsWith("http") } + .map { uri -> + uri.position(this) to uri.value.trimEnd('.').toURI() + } + .filter { it.second != null } + .map { it.first to it.second!! } + .filter { it.second.scheme in setOf("http", "https") } + + /** + * Parses the string as [URI]; returns `null` instead of throwing when the string is not a valid URI. + */ + internal fun String.toURI(): URI? = try { + URI.create(this) + } catch (e: IllegalArgumentException) { + null + } + + /** + * Requests the URI with HTTP GET and reports the outcome. + * + * Rate-limited responses are retried after the delay announced by the server (500 ms when none is announced), + * but at most [retriesLeft] times, so a permanent 403 cannot cause endless recursion. + * + * @param client HTTP client used for the request + * @param retriesLeft how many more rate-limit retries are allowed + * @return status code with the redirect target (redirects) or the requested URL (other codes); + * `-1` with the error message in the URI query when the request failed + */ + private fun URI.check(client: OkHttpClient, retriesLeft: Int = MAX_RATE_LIMIT_RETRIES): Pair<Int, URI> { + val request = Request.Builder().url(this.toURL()).get().build() + // todo Cache + return try { + logger.info("Check: $this") + client.newCall(request).execute().use { response -> + when (response.code) { + //Redirects: extract new location; a missing or invalid header goes through the error branch with a clear message + in HTTP_REDIRECT -> response.code to requireNotNull(response.header("Location")?.toURI()) { "Redirect without valid Location header" } 
+ + //Rate limiting: wait and retry while attempts remain, otherwise report the code as is + in HTTP_RATE_LIMIT -> if (retriesLeft <= 0) { + response.code to response.request.url.toUri() + } else { + val now = Instant.now() + val await = response.headers.rateLimitAwait(now) ?: 500 + + logger.debug("Await: $await ms") + Thread.sleep(await) + check(client, retriesLeft - 1) + } + + else -> response.code to response.request.url.toUri() + } + } + } catch (e: Exception) { + logger.error(e) { "Handle error on checking $this" } + // The multi-argument URI constructor quotes illegal characters, so any exception message is safe here + -1 to URI("http", "host", null, "message=${e.message}", null) + } + } + + private val HTTP_REDIRECT = setOf(301, 302, 307, 308) + private val HTTP_RATE_LIMIT = setOf(403, 429) + + /** + * Upper bound for rate-limit retries of a single link. + */ + private const val MAX_RATE_LIMIT_RETRIES = 5 + + /** + * Extracts from the rate-limit headers how long to wait, in milliseconds. + * + * @param now current time, used for headers that carry a point in time + * @return first non-negative delay produced by [HTTP_RATE_LIMIT_EXTRACTORS], or `null` when there is none + */ + private fun Headers.rateLimitAwait(now: Instant): Long? = HTTP_RATE_LIMIT_EXTRACTORS + .flatMap { values(it.key).asSequence().map { v -> it.value(v.trim(), now) } } + .filterNotNull() + .firstOrNull { it >= 0 } + + /** + * Header name to a function that converts the header value and the current time into a delay in milliseconds. + */ + private val HTTP_RATE_LIMIT_EXTRACTORS: Map<String, (String, Instant) -> Long?> = mapOf( + // https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28#checking-your-rate-limit-status + "x-ratelimit-reset" to { value, now -> + value + .toLong() + .let(Instant::ofEpochSecond) + .let { Duration.between(now.with(NANO_OF_SECOND, 0), it) } + .let(Duration::toMillis) + }, + // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After + "Retry-After" to { value, now -> + // "delay-seconds" variant: convert to milliseconds, the unit the caller sleeps in + if (value.isDigit()) Duration.ofSeconds(value.toLong()).toMillis() + else HTTP_DATE_FORMAT + .parse(value, Instant::from) + .let { Duration.between(now.with(NANO_OF_SECOND, 0), it) } + .let(Duration::toMillis) + }, + ) + + /** + * @see Specification + */ + internal val HTTP_DATE_FORMAT = DateTimeFormatter.RFC_1123_DATE_TIME + + private fun String.isDigit(): Boolean = this.all { it.isDigit() } + } +} diff --git a/src/test/kotlin/name/valery1707/problem/LinkCheckerTest.kt b/src/test/kotlin/name/valery1707/problem/LinkCheckerTest.kt new file mode 100644 index 0000000..7fde941 --- /dev/null +++ b/src/test/kotlin/name/valery1707/problem/LinkCheckerTest.kt @@ -0,0 +1,145 @@ +package name.valery1707.problem + +import 
name.valery1707.problem.LinkChecker.Companion.HTTP_DATE_FORMAT +import name.valery1707.problem.LinkChecker.Companion.toURI +import okhttp3.Headers.Companion.toHeaders +import okhttp3.OkHttpClient +import okhttp3.mock.MockInterceptor +import okhttp3.mock.body +import org.assertj.core.api.Assertions.assertThat +import org.assertj.core.api.Assertions.fail +import org.assertj.core.api.Assumptions.assumeThat +import org.junit.jupiter.api.Test +import org.junit.jupiter.params.ParameterizedTest +import org.junit.jupiter.params.provider.ValueSource +import java.net.InetSocketAddress +import java.net.ProxySelector +import java.nio.file.Path +import java.time.Instant +import java.time.ZoneId +import java.util.* +import kotlin.io.path.toPath +/** + * Lazily produced description of a mocked response: the status code paired with the response headers. + */ +typealias ResponseMeta = () -> Pair> +/** + * Tests for [LinkChecker]. + */ +internal class LinkCheckerTest { + /** + * Runs the checker against a real project directory with an HTTP client that does not follow redirects + * and expects no invalid links. The assumption requires [path] to be a readable directory. + */ + @ParameterizedTest + @ValueSource( + strings = [ + "./path/to/real/project", + ], + ) + internal fun checkReal(path: Path) { + assumeThat(path).isDirectory.isReadable + val client = OkHttpClient.Builder() + .followRedirects(false).followSslRedirects(false) + .proxySelector(proxy) + .build() + val checker = LinkChecker(path) + assertThat(checker.findInvalid(client)).isEmpty() + } + /** + * Checks the directory of the `Demo.md` resource with a mocked client that replays scripted responses per URL; + * only the two redirected links and the 404 link are expected in the result. + */ + @Test + @Suppress("HttpUrlsUsage") + internal fun testDemo() { + val path = javaClass.getResource("/linkChecker/Demo.md")?.toURI()?.toPath()?.parent + assertThat(path).isNotNull.isDirectory.isReadable + + fun ok(): ResponseMeta = { 200 to mapOf() } + fun notFound(): ResponseMeta = { 404 to mapOf() } + fun redirect(code: Int, target: String): ResponseMeta = { code to mapOf("Location" to target) } + fun rateLimitGH(awaitMillis: Long): ResponseMeta = { 403 to mapOf("x-ratelimit-reset" to Instant.now().plusMillis(awaitMillis).epochSecond.toString()) } + fun rateLimitSpecSec(awaitSec: Int): ResponseMeta = { 429 to mapOf("Retry-After" to awaitSec.toString()) } + fun rateLimitSpecDate(awaitMillis: Long): ResponseMeta = { + 429 to mapOf("Retry-After" to 
HTTP_DATE_FORMAT.format(Instant.now().plusMillis(awaitMillis).atZone(ZoneId.systemDefault()))) + } + + //Check links via: curl --silent -X GET --head 'URL' + val client = mockHttpClient( + mapOf( + "https://habr.com/ru/company/otus/blog/707724/comments" to mutableListOf( + redirect(302, "https://habr.com/ru/company/otus/blog/707724/comments/"), + ), + "https://habr.com/ru/company/otus/blog/707724/comments/" to mutableListOf( + ok(), + ), + "http://schema.org/" to mutableListOf( + redirect(301, "https://schema.org/"), + ), + "https://github.com/androidx/androidx/blob/androidx-main/build.gradle" to mutableListOf( + //Will wait some time + rateLimitGH(2111), + //Will wait zero time + rateLimitGH(10), + //Will wait default time + rateLimitGH(-1500), + ok(), + ), + "https://www.bearer.com/" to mutableListOf( + // Use variant with "delay-seconds" + rateLimitSpecSec(1), + // Use variant with "http-date" + rateLimitSpecDate(100), + ok(), + ), + "https://github.com/androidx/androidx/blob/androidx-main/buildSrc/public/src/main/kotlin/androidx/build/LibraryGroups.kt" to mutableListOf( + notFound(), + ), + ), + ) + + val checker = LinkChecker(path!!) 
+ + assertThat(checker.findInvalid(client)).containsExactlyInAnyOrderEntriesOf( + mapOf( + "Demo.md:1:25" to "https://habr.com/ru/company/otus/blog/707724/comments -> 302 -> https://habr.com/ru/company/otus/blog/707724/comments/", + "Demo.md:3:14" to "http://schema.org -> 301 -> https://schema.org/", + "Demo.md:7:14" to "https://github.com/androidx/androidx/blob/androidx-main/buildSrc/public/src/main/kotlin/androidx/build/LibraryGroups.kt -> 404", + ), + ) + } + /** + * Verifies that a string which is not a valid URI is converted to `null`. + */ + @ParameterizedTest + @ValueSource( + strings = [ + "some invalid uri", + ], + ) + internal fun testInvalidUriString(uriString: String) { + assertThat(uriString.toURI()).isNull() + } + /** + * Proxy selector for the first candidate address that resolves; falls back to the default selector when none does. + */ + private val proxy: ProxySelector by lazy { + sequenceOf( + "genproxy" to 8080, + ) + .map { InetSocketAddress(it.first, it.second) } + .filterNot { it.isUnresolved } + .map { ProxySelector.of(it) } + .firstOrNull() + ?: ProxySelector.getDefault() + } + /** + * Test helpers: `mockHttpClient` builds a client whose interceptor answers each request with the next scripted + * response for the requested URL and fails the test on an unknown URL or when the responses for a URL run out. + */ + private companion object { + fun mockHttpClient(responses: Map>): OkHttpClient { + val interceptor = MockInterceptor() + + interceptor.addRule() + .anyTimes() + .answer { req -> + val uri = req.url.toUri() + val meta = ((responses[uri.toString()] ?: fail("Unknown URI: $uri")).removeFirstOrNull() ?: fail("Too many requests for URI: $uri"))() + okhttp3.Response.Builder() + .code(meta.first) + .headers(meta.second.toHeaders()) + .body("") + } + + return OkHttpClient.Builder() + .addInterceptor(interceptor) + .build(); + } + } + +} diff --git a/src/test/resources/linkChecker/Demo.md b/src/test/resources/linkChecker/Demo.md new file mode 100644 index 0000000..362f0df --- /dev/null +++ b/src/test/resources/linkChecker/Demo.md @@ -0,0 +1,7 @@ +Link with name: [named](https://habr.com/ru/company/otus/blog/707724/comments). +Link with name: [named](https://habr.com/ru/company/otus/blog/707724/comments/). +Link inlined http://schema.org. 
+Link with rate limiting: +* https://github.com/androidx/androidx/blob/androidx-main/build.gradle +* https://www.bearer.com/ +Link absent: https://github.com/androidx/androidx/blob/androidx-main/buildSrc/public/src/main/kotlin/androidx/build/LibraryGroups.kt diff --git a/versions.lock b/versions.lock index 5f1b93a..bca0543 100644 --- a/versions.lock +++ b/versions.lock @@ -1,4 +1,6 @@ # Run ./gradlew --write-locks to regenerate this file +ch.qos.logback:logback-classic:1.4.5 (1 constraints: 0c050136) +ch.qos.logback:logback-core:1.4.5 (1 constraints: 0b0d071d) com.fasterxml.jackson:jackson-bom:2.14.1 (3 constraints: 7f37c0d3) com.fasterxml.jackson.core:jackson-annotations:2.14.1 (2 constraints: 3f21f093) com.fasterxml.jackson.core:jackson-core:2.14.1 (2 constraints: 3f21f093) @@ -6,6 +8,7 @@ com.fasterxml.jackson.core:jackson-databind:2.14.1 (2 constraints: f1138582) com.github.mifmif:generex:1.0.2 (1 constraints: e00921ac) commons-codec:commons-codec:1.15 (1 constraints: b6094aa2) dk.brics.automaton:automaton:1.11-8 (1 constraints: 900be8e7) +io.github.microutils:kotlin-logging-jvm:3.0.4 (1 constraints: 0905fe35) net.andreinc:aleph:0.1.1 (1 constraints: df091eac) net.andreinc:markovneat:1.8 (1 constraints: 88099b98) net.andreinc:mockneat:0.4.8 (1 constraints: 0e05ff35) @@ -13,9 +16,10 @@ org.apache.commons:commons-lang3:3.12.0 (2 constraints: ab17f565) org.apache.commons:commons-text:1.9 (1 constraints: 89099c98) org.jetbrains:annotations:13.0 (1 constraints: df0e795c) org.jetbrains.kotlin:kotlin-stdlib:1.7.22 (3 constraints: 463049c4) -org.jetbrains.kotlin:kotlin-stdlib-common:1.7.22 (1 constraints: 450fd27a) +org.jetbrains.kotlin:kotlin-stdlib-common:1.7.22 (2 constraints: 71207b44) org.jetbrains.kotlin:kotlin-stdlib-jdk7:1.7.22 (1 constraints: e310fbd2) -org.jetbrains.kotlin:kotlin-stdlib-jdk8:1.7.22 (1 constraints: 3e05453b) +org.jetbrains.kotlin:kotlin-stdlib-jdk8:1.7.22 (2 constraints: 6a16ed11) +org.slf4j:slf4j-api:2.0.6 (3 constraints: 0723153f) [Test 
dependencies] net.bytebuddy:byte-buddy:1.12.10 (1 constraints: 7b0bbcea)