Commit 7ddb59d

fix: upgrading to the latest versions
1 parent 3b6faf4 commit 7ddb59d

9 files changed: +122 −152 lines changed

Cargo.toml

Lines changed: 22 additions & 12 deletions
```diff
@@ -1,7 +1,7 @@
 [package]
 name = "pschema-rs"
-version = "0.0.3"
-authors = [ "Ángel Iglesias Préstamo <[email protected]>" ]
+version = "0.0.4"
+authors = ["Ángel Iglesias Préstamo <[email protected]>"]
 description = "Pregel-based schema validation algorithm written in Rust for generating Wikidata subsets"
 documentation = "https://docs.rs/crate/pschema-rs/latest"
 repository = "https://github.com/angelip2303/pschema-rs"
@@ -12,14 +12,24 @@ keywords = ["pregel", "wikidata", "subsetting", "duckdb", "validation"]
 categories = ["algorithms", "database", "mathematics", "science"]

 [dependencies]
-pregel-rs = { version = "0.0.13" }
-wikidata-rs = { version = "0.0.4" }
-polars = { version = "0.30.0", features = ["lazy", "is_in", "performant", "parquet", "chunked_ids", "list_eval", "dtype-categorical", "rows", "is_first"] }
-duckdb = { version = "0.7.1" }
+pregel-rs = { path = "../pregel-rs" }
+wikidata-rs = { path = "../wd2duckdb/wikidata-rs" }
+polars = { version = "0.45.1", features = [
+    "lazy",
+    "is_in",
+    "performant",
+    "parquet",
+    "chunked_ids",
+    "list_eval",
+    "dtype-categorical",
+    "rows",
+    "is_first_distinct",
+] }
+duckdb = { version = "1.1.1" }
 rayon = "1.7.0"
-wikidata = "0.3.0"
-strum = "0.24.1"
-strum_macros = "0.24"
+wikidata = "1.1.0"
+strum = "0.26.3"
+strum_macros = "0.26.4"
 bimap = "0.6.3"
 rio_turtle = "0.8.4"
 rio_api = "0.8.4"
@@ -28,12 +38,12 @@ rio_api = "0.8.4"
 jemallocator = "0.5.0"

 [target.'cfg(target_env = "msvc")'.dependencies]
-mimalloc = { version = "0.1.37", default-features = false }
+mimalloc = { version = "0.1.43", default-features = false }

 [dev-dependencies]
-duckdb = { version="0.7.1", features=["bundled"] }
+duckdb = { version = "1.1.1", features = ["bundled"] }

 [profile.release]
 codegen-units = 1
 opt-level = 3
-lto = "thin"
+lto = "thin"
```
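Two of the changes above are worth flagging. First, `pregel-rs` and `wikidata-rs` now point at bare local `path` dependencies with no `version` key, so this revision cannot be published to crates.io as-is: `cargo publish` requires a version for path dependencies. Second, the polars 0.30 → 0.45 jump renames the `is_first` crate feature to `is_first_distinct`, matching the renamed expression. A minimal sketch of the renamed API, assuming polars 0.45 with the `lazy` and `is_first_distinct` features enabled:

```rust
use polars::prelude::*;

fn main() -> PolarsResult<()> {
    let df = df!["id" => [1, 1, 2, 3, 3]]?;

    let out = df
        .lazy()
        // `is_first` became `is_first_distinct`: it flags the first
        // occurrence of each distinct value in the column.
        .with_column(col("id").is_first_distinct().alias("first_seen"))
        .collect()?;

    println!("{out}");
    Ok(())
}
```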

README.md

Lines changed: 16 additions & 72 deletions
````diff
@@ -5,98 +5,42 @@
 [![latest_version](https://img.shields.io/crates/v/pschema-rs)](https://crates.io/crates/pschema-rs)
 [![documentation](https://img.shields.io/docsrs/pschema-rs/latest)](https://docs.rs/pschema-rs/latest/pschema_rs/)
 
-`pschema-rs` is a Rust library that provides a Pregel-based schema validation algorithm for generating subsets of data
+`pschema-rs` is a Rust library that provides a Pregel-based schema validation algorithm for generating subsets of data
 from Wikidata. It is designed to be efficient, scalable, and easy to use, making it suitable for a wide range of applications
 that involve processing large amounts of data from Wikidata.
 
 ## Features
 
-- **Pregel-based schema validation**: `pschema-rs` uses the Pregel model, a graph-based computation model, to perform
-schema validation on Wikidata entities. This allows for efficient and scalable processing of large datasets.
+- **Pregel-based schema validation**: `pschema-rs` uses the Pregel model, a graph-based computation model, to perform
+schema validation on Wikidata entities. This allows for efficient and scalable processing of large datasets.
 
 - **Rust implementation**: `pschema-rs` is implemented in Rust, a systems programming language known for its performance,
-memory safety, and concurrency features. This ensures that the library is fast, reliable, and safe to use.
+memory safety, and concurrency features. This ensures that the library is fast, reliable, and safe to use.
 
-- **Wikidata subset generation**: `pschema-rs` provides functionality to generate subsets of data from Wikidata based on
-schema validation rules. This allows users to filter and extract relevant data from Wikidata based on their specific
-requirements.
+- **Wikidata subset generation**: `pschema-rs` provides functionality to generate subsets of data from Wikidata based on
+schema validation rules. This allows users to filter and extract relevant data from Wikidata based on their specific
+requirements.
 
-- **Customizable validation rules**: `pschema-rs` allows users to define their own validation rules using a simple and
-flexible syntax. This makes it easy to customize the schema validation process according to the specific needs of a given
-application.
+- **Customizable validation rules**: `pschema-rs` allows users to define their own validation rules using a simple and
+flexible syntax. This makes it easy to customize the schema validation process according to the specific needs of a given
+application.
 
 - **Easy-to-use API**: `pschema-rs` provides a user-friendly API that makes it easy to integrate the library into any Rust
-project. The API provides a high-level interface for performing schema validation and generating Wikidata subsets, with
-comprehensive documentation and examples to help users get started quickly.
+project. The API provides a high-level interface for performing schema validation and generating Wikidata subsets, with
+comprehensive documentation and examples to help users get started quickly.
 
 ## Installation
 
 To use `pschema-rs` in your Rust project, you can add it as a dependency in your `Cargo.toml` file:
 
 ```toml
 [dependencies]
-pschema = "0.0.2"
+pschema = "0.0.4"
 ```
 
 ## Usage
 
-Here's an example of how you can use `pschema-rs` to perform schema validation and generate a subset of data from Wikidata.
-Note that what we are doing here is first, defining the `ShapeExpression` we want the algorithm to validate. Next, we import
-the Wikidata entities from a file. Note that the import methods we have defined create an edge DataFrame, and as such, we
-need to call to the function `GraphFrame::from_edges(edges)`, which will build the GraphFrame from the imported edges. Lastly,
-by calling `PSchema::new(start).validate(graph)`, we will both construct the `PSchema` algorithm provided the `ShapeExpression`
-we have defined, first, and create the subset of the graph, second. Then, we print the results. Note that we can also export
-the results to a file. See the [examples](https://github.com/angelip2303/pschema-rs/tree/main/examples) for more information.
-
-```rust
-use pregel_rs::graph_frame::GraphFrame;
-use pschema_rs::backends::duckdb::DuckDB;
-use pschema_rs::backends::Backend;
-use pschema_rs::pschema::PSchema;
-use pschema_rs::shape::shex::Shape;
-use pschema_rs::shape::shex::NodeConstraint;
-use pschema_rs::shape::shex::TripleConstraint;
-use wikidata_rs::id::Id;
-
-fn main() -> Result<(), String> {
-    // Define validation rules
-    let start = Shape::TripleConstraint(TripleConstraint::new(
-        "City",
-        u32::from(Id::from("P31")),
-        NodeConstraint::Value(u32::from(Id::from("Q515"))),
-    ));
-
-    // Load Wikidata entities
-    let edges = DuckDB::import("./examples/from_duckdb/3000lines.duckdb")?;
-
-    // Perform schema validation
-    match GraphFrame::from_edges(edges) {
-        Ok(graph) => match PSchema::new(start).validate(graph) {
-            Ok(result) => {
-                println!("Schema validation result:");
-                println!("{:?}", result);
-                Ok(())
-            }
-            Err(error) => Err(error.to_string()),
-        },
-        Err(error) => Err(format!("Cannot create a GraphFrame: {}", error)),
-    }
-}
-```
-
-You could also run one of the examples to check how this library works:
-
-```sh
-cargo build
-cargo run --example from_duckdb
-```
-
-Or follow the guidelines explained in [examples/from_uniprot](https://github.com/angelip2303/pschema-rs/tree/main/examples/from_uniprot)
-where a more detailed use-case is shown.
-
-For more information on how to define validation rules, load entities from Wikidata, and process subsets of data, refer
-to the documentation.
+TBD
 
 ## Related projects
 
@@ -114,11 +58,11 @@ the Free Software Foundation, either version 3 of the License, or
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
-along with this program. If not, see <https://www.gnu.org/licenses/>.
+along with this program. If not, see <https://www.gnu.org/licenses/>.
 
 **By contributing to this project, you agree to release your
 contributions under the same license.**
````

src/backends/duckdb.rs

Lines changed: 10 additions & 10 deletions
```diff
@@ -35,7 +35,7 @@ impl Backend for DuckDB {
         let format = |id: DataType| {
             format!(
                 "SELECT src_id, property_id, CAST({:} AS UINTEGER) FROM {:}",
-                u32::from(Id::DataType(id.to_owned())),
+                u32::from(Id::DataType(id.clone())),
                 id.as_ref()
             )
         };
@@ -77,35 +77,35 @@ impl Backend for DuckDB {
             .map(|batch| {
                 match DataFrame::new(vec![
                     Series::new(
-                        Column::Subject.as_ref(),
-                        // because we know that the first column is the src_id
+                        Column::Subject.as_ptr(),
                         batch
                             .column(0)
                             .as_any()
                             .downcast_ref::<UInt32Array>()
                             .unwrap()
                             .values(),
-                    ),
+                    )
+                    .into(),
                     Series::new(
-                        Column::Predicate.as_ref(),
-                        // because we know that the second column is the property_id
+                        Column::Predicate.as_ptr(),
                         batch
                             .column(1)
                             .as_any()
                             .downcast_ref::<UInt32Array>()
                             .unwrap()
                             .values(),
-                    ),
+                    )
+                    .into(),
                     Series::new(
-                        Column::Object.as_ref(),
-                        // because we know that the third column is the dst_id
+                        Column::Object.as_ptr(),
                         batch
                             .column(2)
                             .as_any()
                             .downcast_ref::<UInt32Array>()
                             .unwrap()
                             .values(),
-                    ),
+                    )
+                    .into(),
                 ]) {
                     Ok(tmp_dataframe) => tmp_dataframe,
                     Err(_) => DataFrame::empty(),
```
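The `.into()` calls compensate for a polars API change: `DataFrame::new` now takes `Vec<Column>` rather than `Vec<Series>`, and `Series::new` expects a `PlSmallStr` name instead of `&str` (hence `as_ptr()` replacing `as_ref()` on the pregel-rs `Column` enum, presumably returning a `PlSmallStr`). A minimal sketch of the new construction path, assuming polars 0.45; the column name and values are illustrative:

```rust
use polars::prelude::*;

fn main() -> PolarsResult<()> {
    // Series::new now takes a PlSmallStr name; a &str converts via .into().
    let subjects = Series::new("src_id".into(), &[1u32, 2, 3]);

    // DataFrame::new now expects Vec<Column>; Series implements Into<Column>.
    let df = DataFrame::new(vec![subjects.into()])?;

    println!("{df}");
    Ok(())
}
```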

src/backends/ntriples.rs

Lines changed: 11 additions & 12 deletions
```diff
@@ -18,7 +18,7 @@ pub struct NTriples;
 
 impl Backend for NTriples {
     fn import(path: &str) -> Result<DataFrame, String> {
-        enable_string_cache(true);
+        enable_string_cache();
 
         let mut subjects = Vec::<String>::new();
         let mut predicates = Vec::<String>::new();
@@ -41,15 +41,14 @@ impl Backend for NTriples {
 
         while !parser.is_end() {
             if parser.parse_step(&mut on_triple).is_err() {
-                // We skip the line if it is not a valid triple
                 continue;
             }
         }
 
         match df![
-            Column::Subject.as_ref() => Series::new(Column::Subject.as_ref(), subjects).cast(&DataType::Categorical(None)).unwrap(),
-            Column::Predicate.as_ref() => Series::new(Column::Predicate.as_ref(), predicates).cast(&DataType::Categorical(None)).unwrap(),
-            Column::Object.as_ref() => Series::new(Column::Object.as_ref(), objects).cast(&DataType::Categorical(None)).unwrap(),
+            Column::Subject.as_ref() => Series::new(Column::Subject.as_ptr(), subjects).cast(&DataType::Categorical(None, CategoricalOrdering::Lexical)).unwrap(),
+            Column::Predicate.as_ref() => Series::new(Column::Predicate.as_ptr(), predicates).cast(&DataType::Categorical(None, CategoricalOrdering::Lexical)).unwrap(),
+            Column::Object.as_ref() => Series::new(Column::Object.as_ptr(), objects).cast(&DataType::Categorical(None, CategoricalOrdering::Lexical)).unwrap(),
         ] {
             Ok(edges) => Ok(edges),
             Err(_) => Err(String::from("Error creating the edges DataFrame")),
@@ -62,12 +61,12 @@ impl Backend for NTriples {
         let mut formatter = NTriplesFormatter::new(writer);
 
         let df = df
-            .to_owned()
+            .clone()
            .lazy()
            .select([
-                col(Column::Subject.as_ref()).cast(DataType::Utf8),
-                col(Column::Predicate.as_ref()).cast(DataType::Utf8),
-                col(Column::Object.as_ref()).cast(DataType::Utf8),
+                col(Column::Subject.as_ref()).cast(DataType::String),
+                col(Column::Predicate.as_ref()).cast(DataType::String),
+                col(Column::Object.as_ref()).cast(DataType::String),
             ])
             .collect()
             .unwrap();
@@ -82,7 +81,7 @@ impl Backend for NTriples {
                 .format(&Triple {
                     subject: match row.get(0) {
                         Some(subject) => match subject {
-                            AnyValue::Utf8(iri) => NamedNode {
+                            AnyValue::String(iri) => NamedNode {
                                 iri: &iri[1..iri.len() - 1],
                             }
                             .into(),
@@ -96,7 +95,7 @@ impl Backend for NTriples {
                     },
                     predicate: match row.get(1) {
                         Some(predicate) => match predicate {
-                            AnyValue::Utf8(iri) => NamedNode {
+                            AnyValue::String(iri) => NamedNode {
                                 iri: &iri[1..iri.len() - 1],
                             },
                             _ => {
@@ -109,7 +108,7 @@ impl Backend for NTriples {
                     },
                     object: match row.get(2) {
                         Some(object) => match object {
-                            AnyValue::Utf8(iri) => {
+                            AnyValue::String(iri) => {
                                 if iri.contains("^^") {
                                     let v: Vec<_> = iri.split("^^").collect();
                                     Literal::Typed {
```
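Three more polars renames surface in this file: `enable_string_cache` no longer takes a boolean, `DataType::Categorical` gained a `CategoricalOrdering` parameter, and `Utf8` became `String` in both `DataType` and `AnyValue`. A minimal round-trip sketch under those assumptions (polars 0.45 with the `dtype-categorical` feature; the series name is illustrative):

```rust
use polars::prelude::*;

fn main() -> PolarsResult<()> {
    // Formerly enable_string_cache(true); the flag argument is gone.
    enable_string_cache();

    // Categorical now carries an ordering; Lexical compares by string value.
    let s = Series::new("subject".into(), &["a", "b", "a"])
        .cast(&DataType::Categorical(None, CategoricalOrdering::Lexical))?;

    // DataType::Utf8 is now DataType::String (AnyValue::Utf8 likewise).
    let back = s.cast(&DataType::String)?;

    println!("{back}");
    Ok(())
}
```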
