Skip to content

Commit

Permalink
feat(miniscrapy): add engine
Browse files Browse the repository at this point in the history
  • Loading branch information
cathaysia committed Nov 24, 2023
1 parent d22763c commit a2fdd0c
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 49 deletions.
2 changes: 2 additions & 0 deletions miniscrapy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ edition = "2021"
reqwest = "0.11.22"
async-trait = "0.1.74"
serde_json = "1.0.108"
tokio = { version = "1.34.0", features = ["full"] }
parking_lot = "0.12.1"
43 changes: 43 additions & 0 deletions miniscrapy/src/engine.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
use crate::*;

use tokio::sync::Mutex;

/// Holds the spiders, middlewares and pipelines that make up a crawl.
///
/// Each collection sits behind its own async [`Mutex`], so components can be
/// registered through a shared `&Engine`.
///
/// All fields are private; `Engine::default()` builds an engine with nothing
/// registered and is the way to obtain one from outside this module.
#[derive(Default)]
pub struct Engine {
    /// Registered spiders; `start` moves the whole list out when it runs.
    spiders: Mutex<Vec<Box<dyn Spider>>>,
    /// Downloader middlewares, in registration order.
    downloader_middleware: Mutex<Vec<Box<dyn DownloaderMiddleware>>>,
    /// Item middlewares, in registration order.
    item_middleware: Mutex<Vec<Box<dyn ItemMiddleware>>>,
    /// Item pipelines, in registration order.
    item_pipeline: Mutex<Vec<Box<dyn ItemPipeline>>>,
}

impl Engine {
    /// Appends `spider` to the list of spiders to run.
    pub async fn add_spider(&self, spider: Box<dyn Spider>) {
        self.spiders.lock().await.push(spider);
    }

    /// Appends `middleware` to the end of the downloader middleware list.
    pub async fn append_downloader_middleware(&self, middleware: Box<dyn DownloaderMiddleware>) {
        self.downloader_middleware.lock().await.push(middleware);
    }

    /// Appends `middleware` to the end of the item middleware list.
    pub async fn append_item_middleware(&self, middleware: Box<dyn ItemMiddleware>) {
        self.item_middleware.lock().await.push(middleware);
    }

    /// Appends `pipeline` to the end of the item pipeline list.
    pub async fn append_item_pipeline(&self, pipeline: Box<dyn ItemPipeline>) {
        self.item_pipeline.lock().await.push(pipeline);
    }

    /// Takes every registered spider out of the engine and polls each one, in
    /// registration order, until it stops yielding requests.
    ///
    /// The spider list is left empty afterwards. A spider is finished as soon
    /// as `next_request` returns `Ok(None)` or an error; this method returns
    /// nothing, so the error itself is dropped.
    pub async fn start(&self) {
        // The guard is a temporary of this statement, so the lock is released
        // before any spider is awaited.
        let spiders = std::mem::take(&mut *self.spiders.lock().await);

        for spider in spiders {
            // Stopping on `Err` matters: re-polling a spider that keeps failing
            // would spin in this loop forever.
            while let Ok(Some(_request)) = spider.next_request().await {
                // TODO: the request is not processed yet; the downloader and
                // middleware chains are not wired in.
            }
        }
    }
}
4 changes: 4 additions & 0 deletions miniscrapy/src/error.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/// Shorthand for a [`Result`] whose error type is [`MiniScrapyError`].
pub type MiniScrapyResult<T> = Result<T, MiniScrapyError>;

/// Error type of the crate.
///
/// The enum has no variants yet, so no value of this type can exist. It still
/// implements [`std::fmt::Display`] and [`std::error::Error`] so that it can be
/// used with `Box<dyn Error>` and similar error plumbing.
#[derive(Debug)]
pub enum MiniScrapyError {}

impl std::fmt::Display for MiniScrapyError {
    fn fmt(&self, _f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // There are no variants, hence no value to format; the empty match is
        // exhaustive and will force an update once a variant is added.
        match *self {}
    }
}

impl std::error::Error for MiniScrapyError {}
55 changes: 6 additions & 49 deletions miniscrapy/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,50 +1,7 @@
use std::sync::Arc;
mod engine;
mod error;
mod traits;

use async_trait::async_trait;
use reqwest::{Request, Response};
use serde_json::Value;

/// A source of requests together with the parser for their responses.
///
/// `Sync` is a supertrait because spiders are passed to the async middleware
/// hooks as `&dyn Spider`. `#[async_trait]` boxes those futures as `Send`,
/// which requires `&dyn Spider: Send`, i.e. `dyn Spider: Sync`.
#[async_trait]
pub trait Spider: Sync {
    /// Yields the next request.
    ///
    /// NOTE(review): presumably `None` means the spider has no more requests;
    /// confirm against the caller.
    async fn next(&self) -> Option<Request>;
    /// Parses `response` into a JSON value.
    ///
    /// NOTE(review): presumably `None` means the response produced no item;
    /// confirm.
    async fn parse(&self, response: Response) -> Option<Value>;
}

/// Outcome of a [`DownloaderMiddleware`] hook.
///
/// `Debug` is derived so outcomes can be logged and used in assertions.
#[derive(Debug)]
pub enum DownloaderMiddlewareReturn {
    /// No value. NOTE(review): presumably "carry on with the next middleware",
    /// as in Scrapy; confirm.
    None,
    /// NOTE(review): presumably "drop this request/response"; confirm.
    Ignore,
    /// Carries a request; the default `process_request` returns its input this way.
    Request(Request),
    /// Carries a response; the default `process_response` returns its input this way.
    Response(Response),
}

/// Hooks around requests and responses.
///
/// Both methods have pass-through defaults, so an implementor only overrides
/// the hook it needs.
#[async_trait]
pub trait DownloaderMiddleware {
/// Hook for a request belonging to `_spider`.
///
/// The default returns `request` unchanged as
/// `DownloaderMiddlewareReturn::Request`.
/// NOTE(review): presumably called before the request is downloaded; confirm.
async fn process_request(
&self,
request: Request,
_spider: &dyn Spider,
) -> DownloaderMiddlewareReturn {
DownloaderMiddlewareReturn::Request(request)
}
/// Hook for a response belonging to `_spider`.
///
/// The default returns `response` unchanged as
/// `DownloaderMiddlewareReturn::Response`.
/// NOTE(review): presumably called after the download completes; confirm.
async fn process_response(
&self,
response: Response,
_spider: &dyn Spider,
) -> DownloaderMiddlewareReturn {
DownloaderMiddlewareReturn::Response(response)
}
}

/// Hooks for items, each receiving the spider involved.
///
/// Only `process_item` must be implemented; the open/close hooks default to
/// doing nothing.
#[async_trait]
pub trait ItemMiddleware {
/// Handles one `item` for `spider`. Returns nothing.
async fn process_item(&self, item: Value, spider: &dyn Spider);
/// No-op by default.
/// NOTE(review): presumably called once when a spider starts; confirm.
async fn open_spider(&self, _spider: &dyn Spider) {}
/// No-op by default.
/// NOTE(review): presumably called once when a spider finishes; confirm.
async fn close_spider(&self, _spider: &dyn Spider) {}
}

/// Item sink with open/process/close hooks; all three methods are required.
#[async_trait]
pub trait Pipeline {
/// Receives a shared handle to the spider.
/// NOTE(review): presumably called before any item is processed; confirm.
async fn open_spider(&self, spider: Arc<dyn Spider>);
/// Handles one `item`. Returns nothing.
async fn process_item(&self, item: Value);
/// NOTE(review): presumably called once the spider is done; confirm.
async fn close_spider(&self);
}
pub use engine::*;
pub use error::*;
pub use traits::*;
51 changes: 51 additions & 0 deletions miniscrapy/src/traits.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
use async_trait::async_trait;
use reqwest::{Request, Response};
use serde_json::Value;

use crate::MiniScrapyResult;

/// A named source of requests together with the parser for their responses.
///
/// `Sync` is a supertrait because spiders are passed to the async middleware
/// and pipeline hooks as `&dyn Spider`. `#[async_trait]` boxes those futures as
/// `Send`, which requires `&dyn Spider: Send`, i.e. `dyn Spider: Sync`.
#[async_trait]
pub trait Spider: Sync {
    /// Returns the spider's name.
    fn name(&self) -> &str;
    /// Yields the next request.
    ///
    /// The engine stops polling a spider once this returns `Ok(None)`.
    async fn next_request(&self) -> MiniScrapyResult<Option<Request>>;
    /// Parses `response` into a JSON value.
    ///
    /// NOTE(review): presumably `None` means the response produced no item;
    /// confirm.
    async fn parse(&self, response: Response) -> Option<Value>;
}

/// Outcome of a [`DownloaderMiddleware`] hook.
///
/// `Debug` is derived so outcomes can be logged and used in assertions.
#[derive(Debug)]
pub enum DownloaderMiddlewareReturn {
    /// No value. NOTE(review): presumably "carry on with the next middleware",
    /// as in Scrapy; confirm.
    None,
    /// NOTE(review): presumably "drop this request/response"; confirm.
    Ignore,
    /// Carries a request; the default `process_request` returns its input this way.
    Request(Request),
    /// Carries a response; the default `process_response` returns its input this way.
    Response(Response),
}

/// Hooks around requests and responses.
///
/// Both methods have pass-through defaults, so an implementor only overrides
/// the hook it needs.
#[async_trait]
pub trait DownloaderMiddleware {
/// Hook for a request belonging to `_spider`.
///
/// The default returns `request` unchanged as
/// `DownloaderMiddlewareReturn::Request`.
/// NOTE(review): presumably called before the request is downloaded; confirm.
async fn process_request(
&self,
request: Request,
_spider: &dyn Spider,
) -> DownloaderMiddlewareReturn {
DownloaderMiddlewareReturn::Request(request)
}
/// Hook for a response belonging to `_spider`.
///
/// The default returns `response` unchanged as
/// `DownloaderMiddlewareReturn::Response`.
/// NOTE(review): presumably called after the download completes; confirm.
async fn process_response(
&self,
response: Response,
_spider: &dyn Spider,
) -> DownloaderMiddlewareReturn {
DownloaderMiddlewareReturn::Response(response)
}
}

/// Hooks for items, each receiving the spider involved.
///
/// Only `process_item` must be implemented; the open/close hooks default to
/// doing nothing.
#[async_trait]
pub trait ItemMiddleware {
/// Handles one `item` for `spider`. Returns nothing.
async fn process_item(&self, item: Value, spider: &dyn Spider);
/// No-op by default.
/// NOTE(review): presumably called once when a spider starts; confirm.
async fn open_spider(&self, _spider: &dyn Spider) {}
/// No-op by default.
/// NOTE(review): presumably called once when a spider finishes; confirm.
async fn close_spider(&self, _spider: &dyn Spider) {}
}

/// Item sink with open/process/close hooks; all three methods are required
/// and each receives the spider involved.
#[async_trait]
pub trait ItemPipeline {
/// NOTE(review): presumably called before any item of `spider` is processed; confirm.
async fn open_spider(&self, spider: &dyn Spider);
/// Handles one `item` for `spider`. Returns nothing.
async fn process_item(&self, item: Value, spider: &dyn Spider);
/// NOTE(review): presumably called once `spider` is done; confirm.
async fn close_spider(&self, spider: &dyn Spider);
}

0 comments on commit a2fdd0c

Please sign in to comment.