Skip to content

Commit f48d09a

Browse files
authored
Merge pull request #32 from oiwn/new
New version; breaks compatibility with the previous one.
2 parents 1e490ae + f5dc799 commit f48d09a

32 files changed

+1955
-977
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,5 @@ Cargo.lock
1919
.env
2020
/tmp
2121
tags
22+
23+
.DS_Store

.tmuxp.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,8 @@ windows:
2020
- window_name: notes
2121
panes:
2222
- shell_command:
23-
- clear
24-
# - emacs -nw notes.org
23+
- emacs -nw notes.org
24+
- window_name: redis
25+
panes:
26+
- shell_command:
27+
- just connect-redis

Cargo.toml

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
[package]
22
name = "capp"
3-
version = "0.3.5"
3+
version = "0.4.1"
44
edition = "2021"
55
license = "MIT"
66
authors = ["oiwn"]
77
description = "Common things i use to build Rust CLI tools for web crawlers."
88
homepage = "https://github.com/oiwn/capp-rs"
99
repository = "https://github.com/oiwn/capp-rs"
1010
readme = "README.md"
11-
keywords = ["mini-celery", "async", "executor"]
11+
keywords = ["web-crawler", "async", "executor"]
1212
categories = ["asynchronous", "web-programming", "concurrency"]
1313
exclude = [
1414
"tmp/*",
@@ -20,25 +20,35 @@ exclude = [
2020
"notes.org"
2121
]
2222

23-
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
23+
[lib]
24+
name = "capp"
25+
path = "src/lib.rs"
26+
doctest = false
27+
28+
[[bin]]
29+
name = "capp"
30+
path = "src/main.rs"
31+
2432

2533
[dependencies]
2634
async-trait = { version = "0.1" }
2735
backoff = { version = "0.4", optional = true, features = ["tokio"] }
2836
derive_builder = { version = "0.20" }
29-
reqwest = { version = "0.12", features = ["gzip", "rustls-tls"], optional = true }
37+
reqwest = { version = "0.12", features = ["gzip", "rustls-tls", "json"], optional = true }
3038
serde = { version = "1.0", features = ["derive"] }
3139
serde_json = { version = "1.0" }
3240
serde_yaml = "0.9"
33-
thiserror = { version = "1.0" }
34-
tokio = { version = "1.39", features = ["full"] }
41+
thiserror = { version = "1" }
42+
tokio = { version = "1.40", features = ["full"] }
3543
uuid = { version = "1.10", features = ["v4", "serde"] }
3644
rustis = { version = "0.13", features = ["tokio-runtime"], optional = true }
37-
once_cell = "1.19"
3845
tracing = "0.1"
3946
tracing-subscriber = "0.3"
40-
anyhow = "1.0"
47+
anyhow = "1"
4148
tracing-futures = "0.2"
49+
indexmap = "2.6"
50+
url = "2.5"
51+
regex = "1.11"
4252

4353
[dev-dependencies]
4454
capp = { path = ".", features = ["http", "healthcheck", "redis"] }
@@ -49,9 +59,9 @@ pin-project-lite = "0.2"
4959
dotenvy = "0.15"
5060
scraper = "0.19"
5161
rand = "0.8"
52-
md5 = "0.7.0"
53-
url = "2.5.0"
54-
base64 = "0.22.1"
62+
md5 = "0.7"
63+
url = "2.5"
64+
base64 = "0.22"
5565

5666
[features]
5767
http = ["dep:backoff", "dep:reqwest"]

Justfile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Common cli tasks
2+
3+
tags:
4+
ctags -R --exclude=*/*.json --exclude=target/* .
5+
6+
lines:
7+
pygount --format=summary --folders-to-skip=target,data,__pycache__,.git --names-to-skip=tags,*.html
8+
9+
connect-redis:
10+
docker exec -it redis-stack redis-cli --askpass

Makefile

Lines changed: 0 additions & 6 deletions
This file was deleted.

examples/basic.rs

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ use capp::prelude::{
55
use capp::{
66
config::Configurable,
77
manager::{WorkersManager, WorkersManagerOptionsBuilder},
8-
storage::{InMemoryTaskStorage, Task, TaskStorage},
8+
queue::{AbstractTaskQueue, InMemoryTaskQueue, TaskQueue},
9+
task::Task,
910
};
1011
use serde::{Deserialize, Serialize};
1112
use std::{path, sync::Arc};
@@ -22,7 +23,6 @@ pub struct DivisionComputation;
2223

2324
#[derive(Debug)]
2425
pub struct Context {
25-
name: String,
2626
config: serde_yaml::Value,
2727
}
2828

@@ -36,7 +36,6 @@ impl Context {
3636
fn from_config(config_file_path: impl AsRef<path::Path>) -> Self {
3737
let config = Self::load_config(config_file_path);
3838
Self {
39-
name: "test-app".to_string(),
4039
config: config.unwrap(),
4140
}
4241
}
@@ -47,28 +46,26 @@ impl Computation<TaskData, Context> for DivisionComputation {
4746
/// TaskRunner will fail tasks which value can't be divided by 3
4847
async fn call(
4948
&self,
50-
_worker_id: WorkerId,
51-
ctx: Arc<Context>,
52-
_storage: Arc<dyn TaskStorage<TaskData> + Send + Sync>,
49+
worker_id: WorkerId,
50+
_ctx: Arc<Context>,
51+
_queue: AbstractTaskQueue<TaskData>,
5352
task: &mut Task<TaskData>,
5453
) -> Result<(), ComputationError> {
55-
// setup spans
56-
// let span = tracing::info_span!("computation", worker_id = %worker_id);
57-
// let _enter = span.enter();
58-
59-
tracing::info!("Task received to process: {:?}", task.get_payload());
54+
tracing::info!(
55+
"[{}] Test division task: {:?}",
56+
worker_id,
57+
task.get_payload()
58+
);
6059

6160
let rem = task.payload.value % 3;
6261
if rem != 0 {
63-
let err_msg = format!("Can't divide {} by 3", task.payload.value);
62+
let err_msg =
63+
format!("[{}] Can't divide {} by 3", worker_id, task.payload.value);
6464
tokio::time::sleep(tokio::time::Duration::from_secs(rem as u64)).await;
6565
return Err(ComputationError::Function(err_msg));
6666
};
6767

6868
task.payload.finished = true;
69-
if ctx.name == "test-app".to_string() {
70-
tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;
71-
}
7269
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
7370
Ok(())
7471
}
@@ -78,16 +75,16 @@ impl Computation<TaskData, Context> for DivisionComputation {
7875
/// For current set following conditions should be true:
7976
/// total tasks = 9
8077
/// number of failed tasks = 4
81-
async fn make_storage() -> impl TaskStorage<TaskData> + Send + Sync {
82-
let storage = InMemoryTaskStorage::new();
78+
async fn make_storage() -> impl TaskQueue<TaskData> + Send + Sync {
79+
let storage = InMemoryTaskQueue::new();
8380

8481
for i in 1..=5 {
8582
let task: Task<TaskData> = Task::new(TaskData {
8683
domain: "one".to_string(),
8784
value: i,
8885
finished: false,
8986
});
90-
let _ = storage.task_push(&task).await;
87+
let _ = storage.push(&task).await;
9188
}
9289

9390
for i in 1..=5 {
@@ -96,7 +93,7 @@ async fn make_storage() -> impl TaskStorage<TaskData> + Send + Sync {
9693
value: i * 3,
9794
finished: false,
9895
});
99-
let _ = storage.task_push(&task).await;
96+
let _ = storage.push(&task).await;
10097
}
10198

10299
for _ in 1..=10 {
@@ -105,7 +102,7 @@ async fn make_storage() -> impl TaskStorage<TaskData> + Send + Sync {
105102
value: 2,
106103
finished: false,
107104
});
108-
let _ = storage.task_push(&task).await;
105+
let _ = storage.push(&task).await;
109106
}
110107
storage
111108
}
File renamed without changes.

examples/hackernews/main.rs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,16 @@
22
use async_trait::async_trait;
33
use base64::{engine::general_purpose::URL_SAFE, Engine as _};
44
use capp::prelude::{
5-
Computation, ComputationError, InMemoryTaskStorage, Task, TaskStorage,
6-
WorkerId, WorkerOptionsBuilder, WorkersManager, WorkersManagerOptionsBuilder,
5+
Computation, ComputationError, InMemoryTaskQueue, Task, TaskQueue, WorkerId,
6+
WorkerOptionsBuilder, WorkersManager, WorkersManagerOptionsBuilder,
77
};
88
use capp::{config::Configurable, http, reqwest};
99
use capp::{tracing, tracing_subscriber};
10-
use once_cell::sync::Lazy;
1110
use rand::{seq::SliceRandom, thread_rng};
1211
use scraper::{Html, Selector};
1312
use serde::{Deserialize, Serialize};
1413
use std::io::Write;
14+
use std::sync::LazyLock;
1515
use std::{
1616
collections::HashSet,
1717
path,
@@ -21,8 +21,8 @@ use url::{ParseError, Url};
2121

2222
const SEED_URLS: [&str; 1] = ["https://news.ycombinator.com"];
2323

24-
static URL_SET: Lazy<Arc<Mutex<HashSet<String>>>> = Lazy::new(|| {
25-
let mut set: HashSet<String> = HashSet::new();
24+
static URL_SET: LazyLock<Mutex<HashSet<String>>> = LazyLock::new(|| {
25+
let mut set = HashSet::new();
2626
// Add some urls we do not want to add into queue
2727
set.insert("https://news.ycombinator.com/submit".into());
2828
set.insert("https://news.ycombinator.com/jobs".into());
@@ -31,7 +31,7 @@ static URL_SET: Lazy<Arc<Mutex<HashSet<String>>>> = Lazy::new(|| {
3131
set.insert("https://news.ycombinator.com/newcomments".into());
3232
set.insert("https://news.ycombinator.com/front".into());
3333
set.insert("https://news.ycombinator.com/newest".into());
34-
Arc::new(Mutex::new(set))
34+
Mutex::new(set)
3535
});
3636

3737
#[derive(Debug)]
@@ -99,7 +99,7 @@ impl Computation<SiteLink, Context> for HNCrawler {
9999
&self,
100100
worker_id: WorkerId,
101101
ctx: Arc<Context>,
102-
storage: Arc<dyn TaskStorage<SiteLink> + Send + Sync + 'static>,
102+
storage: Arc<dyn TaskQueue<SiteLink> + Send + Sync + 'static>,
103103
task: &mut Task<SiteLink>,
104104
) -> Result<(), ComputationError> {
105105
tracing::info!("[worker-{}] Processing task: {:?}", worker_id, task);
@@ -189,13 +189,13 @@ impl HNCrawler {
189189
// Store links to website for further crawling
190190
async fn store_links_website(
191191
links: Vec<Url>,
192-
storage: Arc<dyn TaskStorage<SiteLink> + Send + Sync>,
192+
storage: Arc<dyn TaskQueue<SiteLink> + Send + Sync>,
193193
) -> Result<usize, anyhow::Error> {
194194
let mut links_stored = 0;
195195
tracing::info!("Adding {} links to the queue...", links.len());
196196

197197
for link in links.iter() {
198-
let link_str = link.as_str().to_string();
198+
let link_str = link.as_str().to_owned();
199199

200200
let should_store = {
201201
// Scoped lock acquisition
@@ -205,7 +205,7 @@ impl HNCrawler {
205205

206206
if should_store {
207207
let link_data = SiteLink { url: link_str };
208-
storage.task_push(&Task::new(link_data)).await?;
208+
storage.push(&Task::new(link_data)).await?;
209209
links_stored += 1;
210210
}
211211
}
@@ -353,7 +353,7 @@ async fn main() {
353353
.build()
354354
.unwrap();
355355

356-
let storage: InMemoryTaskStorage<SiteLink> = InMemoryTaskStorage::new();
356+
let storage: InMemoryTaskQueue<SiteLink> = InMemoryTaskQueue::new();
357357
let tasks_queue_len = storage.list.lock().unwrap().len();
358358

359359
tracing::info!("Website links tasks in queue: {}", tasks_queue_len);
@@ -362,7 +362,7 @@ async fn main() {
362362
tracing::warn!("Queue is empty! Seeding urls... {}", SEED_URLS.join(" "));
363363
for url in SEED_URLS.iter() {
364364
let initial_task = Task::new(SiteLink::new(url));
365-
let _ = storage.task_push(&initial_task).await;
365+
let _ = storage.push(&initial_task).await;
366366
}
367367
}
368368

src/healthcheck.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use reqwest::{Client, StatusCode};
2+
use serde_json::Value;
23
use tokio::time::{timeout, Duration};
34

45
// const GOOGLE: &str = "http://www.google.com";
@@ -37,3 +38,34 @@ pub async fn internet(http_url: &str) -> bool {
3738
);
3839
false
3940
}
41+
42+
pub async fn test_proxy(proxy_url: &str) -> bool {
43+
let client = Client::new();
44+
let proxy_client = Client::builder()
45+
.proxy(reqwest::Proxy::all(proxy_url).unwrap())
46+
.build()
47+
.unwrap();
48+
49+
let ip_check_url = "https://httpbin.org/ip";
50+
51+
// Get local IP
52+
let local_ip = match get_ip(&client, ip_check_url).await {
53+
Ok(ip) => ip,
54+
Err(_) => return false,
55+
};
56+
57+
// Get IP through proxy
58+
let proxy_ip = match get_ip(&proxy_client, ip_check_url).await {
59+
Ok(ip) => ip,
60+
Err(_) => return false,
61+
};
62+
63+
// Compare IPs
64+
local_ip != proxy_ip
65+
}
66+
67+
async fn get_ip(client: &Client, url: &str) -> Result<String, reqwest::Error> {
68+
let response = client.get(url).send().await?;
69+
let body: Value = response.json().await?;
70+
Ok(body["origin"].as_str().unwrap_or("").to_string())
71+
}

src/lib.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ pub mod healthcheck;
55
pub mod http;
66
pub mod manager;
77
pub mod prelude;
8-
pub mod storage;
8+
pub mod queue;
9+
pub mod router;
10+
pub mod task;
911
// #[cfg(test)]
1012
// mod support;
1113
// #[cfg(test)]
@@ -16,6 +18,8 @@ pub use async_trait;
1618
#[cfg(feature = "http")]
1719
pub use backoff;
1820
pub use derive_builder;
21+
pub use indexmap;
22+
pub use regex;
1923
#[cfg(feature = "http")]
2024
pub use reqwest;
2125
#[cfg(feature = "redis")]
@@ -26,4 +30,5 @@ pub use serde_yaml;
2630
pub use thiserror;
2731
pub use tracing;
2832
pub use tracing_subscriber;
33+
pub use url;
2934
pub use uuid;

0 commit comments

Comments
 (0)