Skip to content

Codify and Resolve modifiers #46

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jun 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,13 @@ use std::fmt::Write;
use std::iter::Peekable;
use std::path::Path;

use self::shared::ModifierSet;

type StrResult<T> = Result<T, String>;

#[path = "src/shared.rs"]
mod shared;

/// A module of definitions.
struct Module<'a>(Vec<(&'a str, Binding<'a>)>);

Expand All @@ -29,7 +34,7 @@ enum Def<'a> {
/// A symbol, either a leaf or with modifiers.
enum Symbol<'a> {
Single(char),
Multi(Vec<(&'a str, char)>),
Multi(Vec<(ModifierSet<&'a str>, char)>),
}

/// A single line during parsing.
Expand All @@ -40,7 +45,7 @@ enum Line<'a> {
ModuleStart(&'a str),
ModuleEnd,
Symbol(&'a str, Option<char>),
Variant(&'a str, char),
Variant(ModifierSet<&'a str>, char),
}

fn main() {
Expand Down Expand Up @@ -79,7 +84,7 @@ fn process(buf: &mut String, file: &Path, name: &str, desc: &str) {

write!(buf, "#[doc = {desc:?}] pub const {name}: Module = ").unwrap();
encode(buf, &module);
buf.push_str(";");
buf.push(';');
}

/// Tokenizes and classifies a line.
Expand Down Expand Up @@ -110,7 +115,7 @@ fn tokenize(line: &str) -> StrResult<Line> {
validate_ident(part)?;
}
let c = decode_char(tail.ok_or("missing char")?)?;
Line::Variant(rest, c)
Line::Variant(ModifierSet::from_raw_dotted(rest), c)
} else {
validate_ident(head)?;
let c = tail.map(decode_char).transpose()?;
Expand Down Expand Up @@ -165,9 +170,9 @@ fn parse<'a>(
p.next();
}

let symbol = if variants.len() > 0 {
let symbol = if !variants.is_empty() {
if let Some(c) = c {
variants.insert(0, ("", c));
variants.insert(0, (ModifierSet::default(), c));
}
Symbol::Multi(variants)
} else {
Expand Down Expand Up @@ -204,15 +209,15 @@ fn encode(buf: &mut String, module: &Module) {
Def::Module(module) => {
buf.push_str("Def::Module(");
encode(buf, module);
buf.push_str(")");
buf.push(')');
}
Def::Symbol(symbol) => {
buf.push_str("Def::Symbol(Symbol::");
match symbol {
Symbol::Single(c) => write!(buf, "Single({c:?})").unwrap(),
Symbol::Multi(list) => write!(buf, "Multi(&{list:?})").unwrap(),
}
buf.push_str(")");
buf.push(')');
}
}
write!(buf, ", deprecation: {:?} }}),", entry.deprecation).unwrap();
Expand Down
51 changes: 47 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
/*!
Human-friendly notation for Unicode symbols.
*/
//! Human-friendly notation for Unicode symbols.
//!
//! ## Model
//! A [`Symbol`] is a collection of one or more _variants_. Each variant is
//! identified by a set of [_modifiers_](ModifierSet) and has a single character
//! as its value. The modifiers themselves can in principle be any non-empty
//! strings that don't contain the character `.`, but codex only defines ones
//! that consist entirely of ASCII alphabetic characters.

pub use self::shared::ModifierSet;

mod shared;

/// A module of definitions.
#[derive(Debug, Copy, Clone)]
Expand Down Expand Up @@ -52,7 +61,41 @@ pub enum Symbol {
/// A symbol without modifiers.
Single(char),
/// A symbol with named modifiers. The symbol defaults to its first variant.
Multi(&'static [(&'static str, char)]),
Multi(&'static [(ModifierSet<&'static str>, char)]),
}

impl Symbol {
    /// Looks up the character this symbol takes for the modifier set `modifs`.
    ///
    /// A `Single` symbol only answers to the empty modifier set. A `Multi`
    /// symbol hands its variant table to `ModifierSet::best_match_in` and
    /// returns whatever that selects. `None` means nothing matched.
    pub fn get(&self, modifs: ModifierSet<&str>) -> Option<char> {
        let table = match self {
            Self::Single(c) => return modifs.is_empty().then_some(*c),
            Self::Multi(table) => *table,
        };
        modifs.best_match_in(table.iter().copied())
    }

    /// Iterates over every `(modifiers, character)` pair of this symbol.
    ///
    /// A `Single` symbol yields exactly one pair whose modifier set is the
    /// default one; a `Multi` symbol yields its table entries in table order.
    pub fn variants(&self) -> impl Iterator<Item = (ModifierSet<&str>, char)> {
        // Stand-in table for the `Single` case so both arms share one type.
        const NO_VARIANTS: &[(ModifierSet<&'static str>, char)] = &[];

        let (lone, table) = match self {
            Self::Single(c) => (Some(*c), NO_VARIANTS),
            Self::Multi(table) => (None, *table),
        };
        lone.into_iter()
            .map(|c| (ModifierSet::default(), c))
            .chain(table.iter().copied())
    }

    /// Iterates over the distinct modifiers used by any variant of this
    /// symbol, in sorted order.
    pub fn modifiers(&self) -> impl Iterator<Item = &str> + '_ {
        // A `BTreeSet` both deduplicates and sorts the modifier names.
        let mut names = std::collections::BTreeSet::new();
        for (set, _) in self.variants() {
            names.extend(set);
        }
        names.into_iter()
    }
}

/// A module that contains the other top-level modules.
Expand Down
230 changes: 230 additions & 0 deletions src/shared.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
use std::ops::Deref;

/// A set of modifiers.
///
/// The set is stored as one string in which the individual modifiers are
/// separated by the character `.`.
///
/// Beware: The derived [`Eq`] and [`Hash`] implementations compare that
/// underlying string, so they depend on the order in which the modifiers are
/// written, unlike a true set. To test for set-wise equality, use
/// [`iter`](Self::iter) and collect into a true set type like
/// [`HashSet`](std::collections::HashSet).
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub struct ModifierSet<S>(
// Note: the visibility needs to be `pub(crate)`, since build.rs outputs
// `ModifierSet(...)`.
pub(crate) S,
);

impl<S: Deref<Target = str>> ModifierSet<S> {
    /// Constructs a modifier set from a string, where modifiers are separated
    /// by the character `.`.
    ///
    /// `s` should not contain any empty modifiers (i.e. it shouldn't contain
    /// the sequence `..`) and no modifier should occur twice. Otherwise,
    /// unexpected errors can occur.
    ///
    /// # Panics
    /// When debug assertions are enabled, panics if `s` contains `..`. The
    /// string is not validated in any other way.
    pub fn from_raw_dotted(s: S) -> Self {
        // Checking the other requirement too feels like it would be a bit too
        // expensive, even for debug mode.
        debug_assert!(
            !s.contains(".."),
            "ModifierSet::from_raw_dotted called with string containing empty modifier"
        );
        Self(s)
    }

    /// Whether `self` is empty, i.e. whether the underlying string is empty.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Gets the string of modifiers separated by `.`.
    pub fn as_str(&self) -> &str {
        &self.0
    }

    /// Borrows the underlying string, producing a `ModifierSet<&str>` that
    /// refers to the same modifiers.
    pub fn as_deref(&self) -> ModifierSet<&str> {
        ModifierSet(&self.0)
    }

    /// Inserts a new modifier into the set by appending it to the underlying
    /// string.
    ///
    /// `m` should not be empty, contain the character `.`, or already be in the
    /// set. Otherwise, unexpected errors can occur. None of these conditions
    /// are checked here.
    pub fn insert_raw(&mut self, m: &str)
    where
        S: for<'a> std::ops::AddAssign<&'a str>,
    {
        // A separator is only needed between modifiers, not before the first.
        if !self.0.is_empty() {
            self.0 += ".";
        }
        self.0 += m;
    }

    /// Iterates over the list of modifiers in an arbitrary order.
    pub fn iter(&self) -> impl Iterator<Item = &str> {
        self.into_iter()
    }

    /// Whether the set contains the modifier `m`.
    ///
    /// This is a linear scan over the modifiers that compares each one to `m`
    /// for exact equality.
    pub fn contains(&self, m: &str) -> bool {
        self.iter().any(|lhs| lhs == m)
    }

    /// Finds the best match from the list.
    ///
    /// To be considered a match, the modifier set must be a superset of (or
    /// equal to) `self`. Among different matches, the best one is selected by
    /// the following two criteria (in order):
    /// 1. Number of modifiers in common with `self` (more is better).
    /// 2. Total number of modifiers (fewer is better).
    ///
    /// If there are multiple best matches, the first of them is returned.
    /// Returns `None` if no entry of `variants` is a match.
    pub fn best_match_in<'a, T>(
        &self,
        variants: impl Iterator<Item = (ModifierSet<&'a str>, T)>,
    ) -> Option<T> {
        let mut best = None;
        let mut best_score = None;

        // Find the best table entry with this name.
        for candidate in variants.filter(|(set, _)| self.is_subset(*set)) {
            let mut matching = 0;
            let mut total = 0;
            for modifier in candidate.0.iter() {
                if self.contains(modifier) {
                    matching += 1;
                }
                total += 1;
            }

            // Tuples compare lexicographically: `matching` decides first, and
            // `Reverse` makes a smaller `total` rank higher on ties.
            let score = (matching, std::cmp::Reverse(total));
            // Strictly greater, so the earliest of several equally good
            // candidates is kept.
            if best_score.is_none_or(|b| score > b) {
                best = Some(candidate.1);
                best_score = Some(score);
            }
        }

        best
    }

    /// Whether all modifiers in `self` are also present in `other`.
    ///
    /// The empty set is a subset of every set.
    pub fn is_subset(&self, other: ModifierSet<&str>) -> bool {
        self.iter().all(|m| other.contains(m))
    }
}

impl<S: Default> Default for ModifierSet<S> {
/// Constructs the default modifier set.
///
/// This is typically the empty set, though the remark from
/// [`Self::from_raw_dotted`] applies since `S::default()` could technically
/// be anything.
fn default() -> Self {
Self(S::default())
}
}

impl<'a, S: Deref<Target = str>> IntoIterator for &'a ModifierSet<S> {
    type Item = &'a str;
    type IntoIter = std::str::Split<'a, char>;

    /// Yields each modifier of the borrowed set. Callers should treat the
    /// order as arbitrary.
    fn into_iter(self) -> Self::IntoIter {
        let dotted: &'a str = &self.0;
        let mut parts = dotted.split('.');
        // Splitting an empty string still produces one empty piece; consume
        // it so that an empty set yields no modifiers at all.
        if dotted.is_empty() {
            parts.next();
        }
        parts
    }
}

impl<'a> IntoIterator for ModifierSet<&'a str> {
    type Item = &'a str;
    type IntoIter = std::str::Split<'a, char>;

    /// Consumes the set and yields each of its modifiers. Callers should
    /// treat the order as arbitrary.
    fn into_iter(self) -> Self::IntoIter {
        let dotted = self.0;
        let mut parts = dotted.split('.');
        // Splitting an empty string still produces one empty piece; consume
        // it so that an empty set yields no modifiers at all.
        if dotted.is_empty() {
            parts.next();
        }
        parts
    }
}

#[cfg(test)]
mod tests {
    type ModifierSet = super::ModifierSet<&'static str>;

    /// Shorthand for building a set from dotted notation.
    fn set(dotted: &'static str) -> ModifierSet {
        ModifierSet::from_raw_dotted(dotted)
    }

    /// Runs `best_match_in` for `query` over a table written in dotted
    /// notation.
    fn pick(query: ModifierSet, table: &[(&'static str, i32)]) -> Option<i32> {
        query.best_match_in(table.iter().map(|&(dotted, value)| (set(dotted), value)))
    }

    #[test]
    fn default_is_empty() {
        let empty = ModifierSet::default();
        assert!(empty.is_empty());
    }

    #[test]
    fn iter_count() {
        assert_eq!(ModifierSet::default().iter().count(), 0);

        let cases = [("a", 1), ("a.b", 2), ("a.b.c", 3)];
        for &(dotted, expected) in cases.iter() {
            assert_eq!(set(dotted).iter().count(), expected);
        }
    }

    #[test]
    fn subset() {
        let cases = [("a", "a.b"), ("a", "b.a"), ("a.b", "b.c.a")];
        for &(smaller, larger) in cases.iter() {
            assert!(set(smaller).is_subset(set(larger)));
        }
    }

    #[test]
    fn best_match() {
        // 1. more modifiers in common with self
        assert_eq!(pick(set("a.b"), &[("a.c", 1), ("a.b", 2)]), Some(2));
        // 2. fewer modifiers in general
        assert_eq!(pick(set("a"), &[("a", 1), ("a.b", 2)]), Some(1));
        // the first rule takes priority over the second
        assert_eq!(pick(set("a.b"), &[("a", 1), ("a.b", 2)]), Some(2));
        // among multiple best matches, the first one is returned
        assert_eq!(pick(ModifierSet::default(), &[("a", 1), ("b", 2)]), Some(1));
    }
}