Skip to content

Commit 8f7d424

Browse files
committed
Init
0 parents  commit 8f7d424

File tree

11 files changed

+3722
-0
lines changed

11 files changed

+3722
-0
lines changed

.github/workflows/master.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
name: CI (master)
2+
3+
on:
4+
push:
5+
branches: [ master ]
6+
pull_request:
7+
branches: [ master ]
8+
9+
env:
10+
CARGO_TERM_COLOR: always
11+
12+
jobs:
13+
build:
14+
15+
runs-on: ubuntu-latest
16+
17+
steps:
18+
- uses: actions/checkout@v2
19+
- name: Build
20+
run: cargo build --verbose
21+
- name: Run tests
22+
run: cargo test --verbose

.github/workflows/release.yml

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
name: CI (Release)
2+
3+
on:
4+
push:
5+
tags:
6+
- 'v*'
7+
8+
env:
9+
CARGO_TERM_COLOR: always
10+
11+
jobs:
12+
test-again:
13+
runs-on: ${{ matrix.os }}
14+
strategy:
15+
matrix:
16+
os:
17+
- ubuntu-latest
18+
- macos-latest
19+
- windows-latest
20+
steps:
21+
- uses: actions/checkout@v2
22+
- uses: actions-rs/toolchain@v1
23+
with:
24+
toolchain: stable
25+
- uses: actions/cache@v2
26+
with:
27+
path: |
28+
~/.cargo/registry
29+
~/.cargo/git
30+
target/release
31+
target/debug
32+
target/.rustc_info.json
33+
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
34+
- name: Test
35+
uses: actions-rs/cargo@v1
36+
with:
37+
command: test
38+
args: --verbose --all-features
39+
release:
40+
runs-on: ubuntu-latest
41+
needs:
42+
- test-again
43+
steps:
44+
- uses: actions/checkout@v2
45+
- name: Create a Release
46+
id: create_release
47+
uses: actions/create-release@v1
48+
env:
49+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
50+
with:
51+
# The name of the tag. This should come from the webhook payload, `github.GITHUB_REF` when a user pushes a new tag
52+
tag_name: ${{ github.ref }}
53+
# The name of the release. For example, `Release v1.0.1`
54+
release_name: Release ${{ github.ref }}
55+
- name: Get the version
56+
id: get_version
57+
run: |
58+
echo ::set-output name=VERSION::${GITHUB_REF#refs/tags/}
59+
echo ::set-output name=DEB_NAME::$(basename $(ls ${{ runner.temp }}/deb-package/*.deb | tail -n 1))
60+
- uses: actions/cache@v2
61+
with:
62+
path: |
63+
~/.cargo/registry
64+
~/.cargo/git
65+
target/release
66+
target/debug
67+
target/.rustc_info.json
68+
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
69+
- uses: actions-rs/toolchain@v1
70+
with:
71+
toolchain: stable
72+
- name: Upload crate
73+
uses: actions-rs/cargo@v1
74+
with:
75+
command: publish
76+
args: --token ${{ secrets.CARGO_TOKEN }}

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/target
2+
Cargo.lock

.vscode/settings.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"python.formatting.provider": "black",
3+
"files.eol": "\n"
4+
}

Cargo.toml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
[package]
2+
name = "oem_cp"
3+
version = "0.1.0"
4+
authors = ["Tatsunori Uchino <[email protected]>"]
5+
edition = "2018"
6+
categories = ["encoding"]
7+
keywords = ["encoding", "charset"]
8+
license = "MIT"
9+
10+
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
11+
12+
[dependencies]
13+
hashbrown = "0.8.1"
14+
lazy_static = "1.4.0"

LICENSE

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Copyright © 2020 Tatsunori Uchino <[email protected]>
2+
3+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4+
5+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6+
7+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

README.md

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# Rust library for OEM Code pages
2+
3+
This library handles many SBCSs (single-byte character sets) that are used as OEM code pages. OEM code pages are still used today for encoding file names in ZIP archives and for text in the Windows terminal.
4+
5+
# Supported code pages
6+
7+
| Code Page | Note |
8+
| --------- | ------------------------------------------------ |
9+
| 437 | OEM United States |
10+
| 737 | OEM Greek (formerly 437G); Greek (DOS) |
11+
| 775 | OEM Baltic; Baltic (DOS) |
12+
| 850 | OEM Multilingual Latin 1; Western European (DOS) |
13+
| 852 | OEM Latin 2; Central European (DOS) |
14+
| 855 | OEM Cyrillic (primarily Russian) |
15+
| 857 | OEM Turkish; Turkish (DOS) |
16+
| 860 | OEM Portuguese; Portuguese (DOS) |
17+
| 861 | OEM Icelandic; Icelandic (DOS) |
18+
| 862 | OEM Hebrew; Hebrew (DOS) |
19+
| 863 | OEM French Canadian; French Canadian (DOS) |
20+
| 864 | OEM Arabic; Arabic (864) |
21+
| 865 | OEM Nordic; Nordic (DOS) |
22+
| 866 | OEM Russian; Cyrillic (DOS) |
23+
| 869 | OEM Modern Greek; Greek, Modern (DOS) |
24+
| 874 | ANSI/OEM Thai (ISO 8859-11); Thai (Windows) |
25+
26+
Notes are quoted from https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
27+
28+
# Examples
29+
30+
## Use specific code pages
31+
32+
### Encoding Unicode string to SBCS bytes
33+
34+
```rust
35+
use oem_cp::{encode_string_checked, encode_string_lossy};
36+
use oem_cp::code_table::{ENCODING_TABLE_CP437, ENCODING_TABLE_CP737};
37+
38+
assert_eq!(encode_string_checked("π≈22/7", &*ENCODING_TABLE_CP437), Some(vec![0xE3, 0xF7, 0x32, 0x32, 0x2F, 0x37]));
39+
// Archimedes in Greek
40+
assert_eq!(encode_string_checked("Αρχιμήδης", &*ENCODING_TABLE_CP737), Some(vec![0x80, 0xA8, 0xAE, 0xA0, 0xA3, 0xE3, 0x9B, 0x9E, 0xAA]));
41+
// ¾ (U+00BE) is not included in CP437
42+
assert_eq!(encode_string_checked("½+¼=¾", &*ENCODING_TABLE_CP437), None);
43+
// Unknown characters can be replaced with ? (0x3F)
44+
assert_eq!(encode_string_lossy("½+¼=¾", &*ENCODING_TABLE_CP437), vec![0xAB, 0x2B, 0xAC, 0x3D, 0x3F]);
45+
```
46+
47+
### Decoding SBCS bytes to Unicode string
48+
49+
```rust
50+
use oem_cp::{decode_string_complete_table, decode_string_incomplete_table_checked, decode_string_incomplete_table_lossy};
51+
use oem_cp::code_table::{DECODING_TABLE_CP437, DECODING_TABLE_CP874};
52+
53+
assert_eq!(&decode_string_complete_table(vec![0xFB, 0xAC, 0x3D, 0xAB], &DECODING_TABLE_CP437), "√¼=½");
54+
55+
// For encodings that have undefined code points, you must use decode_string_incomplete_table_{checked,lossy} instead of decode_string_complete_table
56+
57+
// means shrimp in Thai (U+E49 => 0xE9)
58+
assert_eq!(decode_string_incomplete_table_checked(vec![0xA1, 0xD8, 0xE9, 0xA7], &DECODING_TABLE_CP874), Some("กุ้ง".to_string()));
59+
// 0x81-0x84, 0x86-0x90, and 0x98-0x9F are invalid in CP874
60+
assert_eq!(decode_string_incomplete_table_checked(vec![0x30, 0x81], &DECODING_TABLE_CP874), None);
61+
// You can use decode_string_incomplete_table_lossy instead
62+
assert_eq!(&decode_string_incomplete_table_lossy(vec![0xA1, 0xD8, 0xE9, 0xA7], &DECODING_TABLE_CP874), "กุ้ง");
63+
// Undefined code points are replaced with U+FFFD (replacement character)
64+
assert_eq!(&decode_string_incomplete_table_lossy(vec![0x30, 0x81], &DECODING_TABLE_CP874), "0\u{FFFD}");
65+
```
66+
67+
## Select appropriate codepage from integer
68+
69+
```rust
70+
use oem_cp::code_table::{ENCODING_TABLE_CP_MAP, DECODING_TABLE_CP_MAP};
71+
use oem_cp::{encode_string_checked, encode_string_lossy};
72+
73+
if let Some(cp874_table) = (*DECODING_TABLE_CP_MAP).get(&874) {
74+
assert_eq!(cp874_table.decode_string_checked(vec![0xA1, 0xD8, 0xE9, 0xA7]), Some("กุ้ง".to_string()));
75+
// undefined mapping 0x81 for CP874
76+
assert_eq!(cp874_table.decode_string_checked(vec![0x81]), None);
77+
assert_eq!(&cp874_table.decode_string_lossy(vec![0x81]), "\u{FFFD}");
78+
} else {
79+
panic!("Why the hell CP874 isn't registered?");
80+
}
81+
82+
if let Some(cp437_table) = (*ENCODING_TABLE_CP_MAP).get(&437) {
83+
assert_eq!(encode_string_checked("π≈22/7", cp437_table), Some(vec![0xE3, 0xF7, 0x32, 0x32, 0x2F, 0x37]));
84+
// ¾ is undefined in CP437
85+
assert_eq!(encode_string_checked("½+¼=¾", cp437_table), None);
86+
// It's replaced with ? (0x3F)
87+
assert_eq!(encode_string_lossy("½+¼=¾", cp437_table), vec![0xAB, 0x2B, 0xAC, 0x3D, 0x3F]);
88+
} else {
89+
panic!("Why the hell CP437 isn't registered?");
90+
}
91+
```
92+
93+
# Support for ANSI/EBCDIC/MBCS code pages
94+
95+
For ANSI (125x) and MBCS (932-950; for CJK languages) code pages, please use [encoding_rs](https://github.com/hsivonen/encoding_rs) instead.
96+
97+
This library is only for extended ASCII encodings (0x00-0x7F must be compatible with ASCII), so EBCDIC encodings will never be supported.
98+
99+
# Symbols from 0x01 to 0x1F
100+
101+
This library doesn't support [symbols mapped from 0x01 to 0x1F in CP437](https://en.wikipedia.org/wiki/Code_page_437). 0x01-0x1F are mapped to U+0001-U+001F. If you prefer symbols, use [codepage_437](https://github.com/nabijaczleweli/codepage-437) instead.
102+
103+
# Licenses
104+
105+
MIT

0 commit comments

Comments
 (0)