diff --git a/.gitmodules b/.gitmodules index cacc7f2..2b92ae9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "uchardet-sys/uchardet"] path = uchardet-sys/uchardet - url = https://github.com/BYVoid/uchardet + url = https://anongit.freedesktop.org/git/uchardet/uchardet.git diff --git a/.travis.yml b/.travis.yml index 5728907..913a4cd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,7 @@ sudo: required rust: - nightly - beta -- 1.0.0 +- stable before_script: - | pip install 'travis-cargo<0.2' --user && diff --git a/Cargo.toml b/Cargo.toml index 94ab6cf..2c07ca3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ unstable = [] [dependencies] libc = "*" +error-chain = "0.5" [dependencies.uchardet-sys] path = "uchardet-sys" diff --git a/src/lib.rs b/src/lib.rs index 5168969..4af5ba9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -//! A wrapper around the uchardet library. Detects character encodings. +//! A wrapper around the uchardet library. Detects character encodings. //! //! Note that the underlying implemention is written in C and C++, and I'm //! not aware of any security audits which have been performed against it. @@ -6,46 +6,60 @@ //! ``` //! use uchardet::detect_encoding_name; //! -//! assert_eq!(Some("windows-1252".to_string()), -//! detect_encoding_name(&[0x66u8, 0x72, 0x61, 0x6e, 0xe7, -//! 0x61, 0x69, 0x73]).unwrap()); +//! assert_eq!("WINDOWS-1252", +//! detect_encoding_name(&[0x46, 0x93, 0x72, 0x61, 0x6e, 0xe7, 0x6f, +//! 0x69, 0x73, 0xe9, 0x94]).unwrap()); //! ``` //! //! For more information, see [this project on //! GitHub](https://github.com/emk/rust-uchardet). +// Increase the compiler's recursion limit for the `error_chain` crate. +#![recursion_limit = "1024"] #![deny(missing_docs)] +#[macro_use] +extern crate error_chain; extern crate libc; extern crate uchardet_sys as ffi; use libc::size_t; -use std::error::Error; -use std::fmt; -use std::result::Result; use std::ffi::CStr; use std::str::from_utf8; -/// An error occurred while trying to detect the character encoding. -#[derive(Debug)] -pub struct EncodingDetectorError { - message: String -} +pub use errors::*; -impl Error for EncodingDetectorError { - fn description(&self) -> &str { "encoding detector error" } - fn cause(&self) -> Option<&Error> { None } -} +#[allow(missing_docs)] +mod errors { + error_chain! { + errors { + UnrecognizableCharset { + description("unrecognizable charset") + display("uchardet was unable to recognize a charset") + } + OutOfMemory { + description("out of memory error") + display("uchardet ran out of memory") + } + Other(int: i32) { + description("unknown error") + display("uchardet returned unknown error {}", int) + } + } + } + + impl ErrorKind { + pub fn from_nsresult(nsresult: ::ffi::nsresult) -> ErrorKind { + assert!(nsresult != 0); + match nsresult { + 1 => ErrorKind::OutOfMemory, + int => ErrorKind::Other(int), + } + } -impl fmt::Display for EncodingDetectorError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", &self.message) } } -/// Either a return value, or an encoding detection error. -pub type EncodingDetectorResult = Result; - /// Detects the encoding of text using the uchardet library. /// /// EXPERIMENTAL: This may be replaced by a better API soon. @@ -53,28 +67,25 @@ struct EncodingDetector { ptr: ffi::uchardet_t } -/// Return the name of the charset used in `data`, or `None` if the -/// charset is ASCII or if the encoding can't be detected. This is -/// the value returned by the underlying `uchardet` library, with -/// the empty string mapped to `None`. +/// Return the name of the charset used in `data` or an error if uchardet +/// was unable to detect a charset. /// /// ``` /// use uchardet::detect_encoding_name; /// -/// assert_eq!(None, detect_encoding_name("ascii".as_bytes()).unwrap()); -/// assert_eq!(Some("UTF-8".to_string()), -/// detect_encoding_name("français".as_bytes()).unwrap()); -/// assert_eq!(Some("windows-1252".to_string()), -/// detect_encoding_name(&[0x66u8, 0x72, 0x61, 0x6e, 0xe7, -/// 0x61, 0x69, 0x73]).unwrap()); +/// assert_eq!("ASCII", +/// detect_encoding_name("ascii".as_bytes()).unwrap()); +/// assert_eq!("UTF-8", +/// detect_encoding_name("©français".as_bytes()).unwrap()); +/// assert_eq!("WINDOWS-1252", +/// detect_encoding_name(&[0x46, 0x93, 0x72, 0x61, 0x6e, 0xe7, 0x6f, +/// 0x69, 0x73, 0xe9, 0x94]).unwrap()); /// ``` -pub fn detect_encoding_name(data: &[u8]) -> - EncodingDetectorResult> -{ +pub fn detect_encoding_name(data: &[u8]) -> Result { let mut detector = EncodingDetector::new(); try!(detector.handle_data(data)); detector.data_end(); - Ok(detector.charset()) + detector.charset() } impl EncodingDetector { @@ -85,39 +96,38 @@ impl EncodingDetector { EncodingDetector{ptr: ptr} } - /// Pass a chunk of raw bytes to the detector. This is a no-op if a + /// Pass a chunk of raw bytes to the detector. This is a no-op if a /// charset has been detected. - fn handle_data(&mut self, data: &[u8]) -> EncodingDetectorResult<()> { - let result = unsafe { + fn handle_data(&mut self, data: &[u8]) -> Result<()> { + let nsresult = unsafe { ffi::uchardet_handle_data(self.ptr, data.as_ptr() as *const i8, data.len() as size_t) }; - match result { + match nsresult { 0 => Ok(()), - _ => { - let msg = "Error handling data".to_string(); - Err(EncodingDetectorError{message: msg}) + int => { + Err(ErrorKind::from_nsresult(int).into()) } } } /// Notify the detector that we're done calling `handle_data`, and that - /// we want it to make a guess as to our encoding. This is a no-op if + /// we want it to make a guess as to our encoding. This is a no-op if /// no data has been passed yet, or if an encoding has been detected - /// for certain. From reading the code, it appears that you can safely + /// for certain. From reading the code, it appears that you can safely /// call `handle_data` after calling this, but I'm not certain. fn data_end(&mut self) { unsafe { ffi::uchardet_data_end(self.ptr); } } /// Reset the detector's internal state. - //fn reset(&mut self) { + // fn reset(&mut self) { // unsafe { ffi::uchardet_reset(self.ptr); } - //} + // } - /// Get the decoder's current best guess as to the encoding. Returns - /// `None` on error, or if the data appears to be ASCII. - fn charset(&self) -> Option { + /// Get the decoder's current best guess as to the encoding. May return + /// an error if uchardet was unable to detect an encoding. + fn charset(&self) -> Result { unsafe { let internal_str = ffi::uchardet_get_charset(self.ptr); assert!(!internal_str.is_null()); @@ -125,9 +135,10 @@ impl EncodingDetector { let charset = from_utf8(bytes); match charset { Err(_) => - panic!("uchardet_get_charset returned invalid value"), - Ok("") => None, - Ok(encoding) => Some(encoding.to_string()) + panic!("uchardet_get_charset returned a charset name \ + containing invalid characters"), + Ok("") => Err(ErrorKind::UnrecognizableCharset.into()), + Ok(encoding) => Ok(encoding.to_string()) } } } diff --git a/uchardet-sys/Cargo.toml b/uchardet-sys/Cargo.toml index e85d9ca..53719a5 100644 --- a/uchardet-sys/Cargo.toml +++ b/uchardet-sys/Cargo.toml @@ -18,3 +18,4 @@ libc = "*" [build-dependencies] pkg-config = '*' +cmake = "*" diff --git a/uchardet-sys/build.rs b/uchardet-sys/build.rs index ff9f97a..6c8605c 100644 --- a/uchardet-sys/build.rs +++ b/uchardet-sys/build.rs @@ -5,71 +5,40 @@ // Patches are welcome to help make it work on other operating systems! extern crate pkg_config; +extern crate cmake; -use std::env; -use std::fs::create_dir; -use std::path::Path; -use std::process::{Command, Stdio}; +use cmake::Config; fn main() { // Do nothing if this package is already provided by the system. if pkg_config::find_library("uchardet").is_ok() { return; } - // Get our configuration from our environment. - let mut cxxflags = env::var("CXXFLAGS").unwrap_or(String::new()); - let target = env::var("TARGET").unwrap(); - let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); - let src = Path::new(&manifest_dir); - let out_dir = env::var("OUT_DIR").unwrap(); - let dst = Path::new(&out_dir); + // Build uchardet ourselves + let mut config = Config::new("uchardet"); - // Fix up our build flags. - if target.contains("i686") { - cxxflags.push_str(" -m32"); - } else if target.contains("x86_64") { - cxxflags.push_str(" -m64"); - } - if !target.contains("i686") { - cxxflags.push_str(" -fPIC"); - } + // Mustn't build the binaries as they aren't compatible with Windows + // and cause a compiler error + config.define("BUILD_BINARY", "OFF"); - // Make a build directory. - let build = dst.join("build"); - // This isn't ideal but until is_dir() is stable just always try to create - // the build directory. If it exists and error is returned, which is - // ignored. - create_dir(&build).unwrap_or(()); - - // Set up our CMake command. - run(Command::new("cmake") - .current_dir(&dst.join("build")) - .arg(&src.join("uchardet")) - .arg("-DCMAKE_BUILD_TYPE=Release") - .arg(&format!("-DCMAKE_INSTALL_PREFIX={}", dst.display())) - .arg(&format!("-DCMAKE_CXX_FLAGS={}", cxxflags))); - - // Run our make command. - run(Command::new("make") - .current_dir(&dst.join("build")) - .arg("install")); + if cfg!(target_os = "windows") && cfg!(target_env = "gnu") { + // Disable sized deallocation as we're unable to link when it's enabled + config.cxxflag("-fno-sized-deallocation"); + } - // Decide how to link our C++ runtime. Feel free to submit patches - // to make this work on your platform. Other likely options are "c++" - // and "c++abi" depending on OS and compiler. - let cxx_abi = "stdc++"; + let dst = config.build(); // Print out link instructions for Cargo. - println!("cargo:rustc-flags=-L {} -l static=uchardet -l {}", - dst.join("lib").display(), cxx_abi); - println!("cargo:root={}", dst.display()); -} - -// Run an external build command. -fn run(cmd: &mut Command) { - println!("running: {:?}", cmd); - assert!(cmd.stdout(Stdio::inherit()) - .stderr(Stdio::inherit()) - .status() - .unwrap() - .success()); + println!("cargo:rustc-link-search=native={}/lib", dst.display()); + println!("cargo:rustc-link-search=native={}/lib64", dst.display()); + println!("cargo:rustc-link-lib=static=uchardet"); + + if !(cfg!(target_os = "windows") && cfg!(target_env = "msvc")) { + // Not needed on windows-msvc + + // Decide how to link our C++ runtime. Feel free to submit patches + // to make this work on your platform. Other likely options are "c++" + // and "c++abi" depending on OS and compiler. + let cxx_abi = "stdc++"; + println!("cargo:rustc-flags=-l {}", cxx_abi); + } } diff --git a/uchardet-sys/src/lib.rs b/uchardet-sys/src/lib.rs index 035d9dd..fa5cecb 100644 --- a/uchardet-sys/src/lib.rs +++ b/uchardet-sys/src/lib.rs @@ -8,11 +8,14 @@ use libc::{c_char, c_int, c_void, size_t}; #[allow(non_camel_case_types)] pub type uchardet_t = *mut c_void; +#[allow(non_camel_case_types)] +pub type nsresult = c_int; + extern { pub fn uchardet_new() -> uchardet_t; pub fn uchardet_delete(ud: uchardet_t); pub fn uchardet_handle_data(ud: uchardet_t, data: *const c_char, - len: size_t) -> c_int; + len: size_t) -> nsresult; pub fn uchardet_data_end(ud: uchardet_t); pub fn uchardet_reset(ud: uchardet_t); pub fn uchardet_get_charset(ud: uchardet_t) -> *const c_char; diff --git a/uchardet-sys/uchardet b/uchardet-sys/uchardet index 69b7133..119fed7 160000 --- a/uchardet-sys/uchardet +++ b/uchardet-sys/uchardet @@ -1 +1 @@ -Subproject commit 69b7133995e4ee260b093323c57a7f8c6c6803b8 +Subproject commit 119fed7e8dcb7b9e72457ff2b268a61d2264f12d