vtparse-0.6.2/.cargo_vcs_info.json0000644000000001450000000000100125250ustar { "git": { "sha1": "edeae72b5fc55c7fa4aa1d08bbefd08c5493f757" }, "path_in_vcs": "vtparse" }vtparse-0.6.2/Cargo.toml0000644000000016460000000000100105320ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2018" name = "vtparse" version = "0.6.2" authors = ["Wez Furlong "] description = "Low level escape sequence parser" documentation = "https://docs.rs/vtparse" readme = "README.md" keywords = [ "terminal", "escape", "ansi", "sequence", "parser", ] license = "MIT" repository = "https://github.com/wez/wezterm" resolver = "2" [dependencies.utf8parse] version = "0.2" [dev-dependencies.k9] version = "0.11" vtparse-0.6.2/Cargo.toml.orig000064400000000000000000000006350072674642500142400ustar 00000000000000[package] authors = ["Wez Furlong "] name = "vtparse" version = "0.6.2" edition = "2018" repository = "https://github.com/wez/wezterm" description = "Low level escape sequence parser" license = "MIT" documentation = "https://docs.rs/vtparse" keywords = ["terminal", "escape", "ansi", "sequence", "parser"] readme = "README.md" [dependencies] utf8parse = "0.2" [dev-dependencies] k9 = "0.11" vtparse-0.6.2/README.md000064400000000000000000000013040072674642500126220ustar 00000000000000# vtparse This is an implementation of a parser for escape and control sequences. It is based on the [DEC ANSI Parser](https://vt100.net/emu/dec_ansi_parser). It has been modified slightly to support UTF-8 sequences. `vtparse` is the lowest level parser; it categorizes the basic types of sequences but does not ascribe any semantic meaning to them. You may wish to look at `termwiz::escape::parser::Parser` in the [termwiz](https://docs.rs/termwiz) crate if you're looking for semantic parsing. ## Comparison with the `vte` crate `vtparse` has support for dynamically sized OSC buffers, which makes it suitable for processing large escape sequences, such as those used by the `iTerm2` image protocol. vtparse-0.6.2/src/enums.rs000064400000000000000000000022050072674642500136300ustar 00000000000000#![allow(dead_code)] #[derive(Debug, Copy, Clone, Eq, PartialEq)] #[repr(u16)] pub enum Action { None = 0, Ignore = 1, Print = 2, Execute = 3, Clear = 4, Collect = 5, Param = 6, EscDispatch = 7, CsiDispatch = 8, Hook = 9, Put = 10, Unhook = 11, OscStart = 12, OscPut = 13, OscEnd = 14, Utf8 = 15, ApcStart = 16, ApcPut = 17, ApcEnd = 18, } impl Action { #[inline(always)] pub fn from_u16(v: u16) -> Self { unsafe { std::mem::transmute(v) } } } #[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] #[repr(u16)] pub enum State { Ground = 0, Escape = 1, EscapeIntermediate = 2, CsiEntry = 3, CsiParam = 4, CsiIntermediate = 5, CsiIgnore = 6, DcsEntry = 7, DcsParam = 8, DcsIntermediate = 9, DcsPassthrough = 10, DcsIgnore = 11, OscString = 12, SosPmString = 13, ApcString = 14, // Special states, always last (no tables for these) Anywhere = 15, Utf8Sequence = 16, } impl State { #[inline(always)] pub fn from_u16(v: u16) -> Self { unsafe { std::mem::transmute(v) } } } vtparse-0.6.2/src/lib.rs000064400000000000000000001112310072674642500132470ustar 00000000000000//! An implementation of the state machine described by //! [DEC ANSI Parser](https://vt100.net/emu/dec_ansi_parser), modified to support UTF-8. //! //! This is sufficient to broadly categorize ANSI/ECMA-48 escape sequences that are //! commonly used in terminal emulators. It does not ascribe semantic meaning to //! those escape sequences; for example, if you wish to parse the SGR sequence //! that makes text bold, you will need to know which codes correspond to bold //! in your implementation of `VTActor`. //! //! You may wish to use `termwiz::escape::parser::Parser` in the //! [termwiz](https://docs.rs/termwiz/) crate if you don't want to have to research //! all those possible escape sequences for yourself. #![allow(clippy::upper_case_acronyms)] use utf8parse::Parser as Utf8Parser; mod enums; use crate::enums::*; mod transitions; use transitions::{ENTRY, EXIT, TRANSITIONS}; #[inline(always)] fn lookup(state: State, b: u8) -> (Action, State) { let v = unsafe { TRANSITIONS .get_unchecked(state as usize) .get_unchecked(b as usize) }; (Action::from_u16(v >> 8), State::from_u16(v & 0xff)) } #[inline(always)] #[cfg(not(test))] fn lookup_entry(state: State) -> Action { unsafe { *ENTRY.get_unchecked(state as usize) } } #[inline(always)] #[cfg(test)] fn lookup_entry(state: State) -> Action { *ENTRY .get(state as usize) .unwrap_or_else(|| panic!("State {:?} has no entry in ENTRY", state)) } #[inline(always)] #[cfg(test)] fn lookup_exit(state: State) -> Action { *EXIT .get(state as usize) .unwrap_or_else(|| panic!("State {:?} has no entry in EXIT", state)) } #[inline(always)] #[cfg(not(test))] fn lookup_exit(state: State) -> Action { unsafe { *EXIT.get_unchecked(state as usize) } } /// `VTActor` is a trait that allows the host application to process /// the different kinds of sequence as they are parsed from the input /// stream. /// /// The functions defined by this trait correspond to the actions defined /// in the [state machine](https://vt100.net/emu/dec_ansi_parser). /// /// ## Terminology: /// An intermediate is a character in the range 0x20-0x2f that /// occurs before the final character in an escape sequence. /// /// `ignored_excess_intermediates` is a boolean that is set in the case /// where there were more than two intermediate characters; no standard /// defines any codes with more than two. Intermediates after /// the second will set this flag and are discarded. /// /// `params` in most of the functions of this trait are decimal integer parameters in escape /// sequences. They are separated by semicolon characters. An omitted parameter is returned in /// this interface as a zero, which represents the default value for that parameter. /// /// Other jargon used here is defined in /// [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf). pub trait VTActor { /// The current code should be mapped to a glyph according to the character set mappings and /// shift states in effect, and that glyph should be displayed. /// /// If the input was UTF-8 then it will have been mapped to a unicode code point. Invalid /// sequences are represented here using the unicode REPLACEMENT_CHARACTER. /// /// Otherwise the parameter will be a 7-bit printable value and may be subject to mapping /// depending on other state maintained by the embedding application. /// /// ## Some commentary from the state machine documentation: /// GL characters (20 to 7F) are /// printed. 20 (SP) and 7F (DEL) are included in this area, although both codes have special /// behaviour. If a 94-character set is mapped into GL, 20 will cause a space to be displayed, /// and 7F will be ignored. When a 96-character set is mapped into GL, both 20 and 7F may cause /// a character to be displayed. Later models of the VT220 included the DEC Multinational /// Character Set (MCS), which has 94 characters in its supplemental set (i.e. the characters /// supplied in addition to ASCII), so terminals only claiming VT220 compatibility can always /// ignore 7F. The VT320 introduced ISO Latin-1, which has 96 characters in its supplemental /// set, so emulators with a VT320 compatibility mode need to treat 7F as a printable /// character. fn print(&mut self, b: char); /// The C0 or C1 control function should be executed, which may have any one of a variety of /// effects, including changing the cursor position, suspending or resuming communications or /// changing the shift states in effect. /// /// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf) /// for more information on C0 and C1 control functions. fn execute_c0_or_c1(&mut self, control: u8); /// invoked when a final character arrives in the first part of a device control string. It /// determines the control function from the private marker, intermediate character(s) and /// final character, and executes it, passing in the parameter list. It also selects a handler /// function for the rest of the characters in the control string. /// /// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf) /// for more information on device control strings. fn dcs_hook( &mut self, mode: u8, params: &[i64], intermediates: &[u8], ignored_excess_intermediates: bool, ); /// This action passes characters from the data string part of a device control string to a /// handler that has previously been selected by the dcs_hook action. C0 controls are also /// passed to the handler. /// /// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf) /// for more information on device control strings. fn dcs_put(&mut self, byte: u8); /// When a device control string is terminated by ST, CAN, SUB or ESC, this action calls the /// previously selected handler function with an “end of data” parameter. This allows the /// handler to finish neatly. /// /// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf) /// for more information on device control strings. fn dcs_unhook(&mut self); /// The final character of an escape sequence has arrived, so determine the control function /// to be executed from the intermediate character(s) and final character, and execute it. /// /// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf) /// for more information on escape sequences. fn esc_dispatch( &mut self, params: &[i64], intermediates: &[u8], ignored_excess_intermediates: bool, byte: u8, ); /// A final character of a Control Sequence Initiator has arrived, so determine the control function to be executed from /// private marker, intermediate character(s) and final character, and execute it, passing in /// the parameter list. /// /// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf) /// for more information on control functions. fn csi_dispatch(&mut self, params: &[CsiParam], parameters_truncated: bool, byte: u8); /// Called when an OSC string is terminated by ST, CAN, SUB or ESC. /// /// `params` is an array of byte strings (which may also be valid utf-8) /// that were passed as semicolon separated parameters to the operating /// system command. fn osc_dispatch(&mut self, params: &[&[u8]]); /// Called when an APC string is terminated by ST /// `data` is the data contained within the APC sequence. fn apc_dispatch(&mut self, data: Vec); } /// `VTAction` is an alternative way to work with the parser; rather /// than implementing the VTActor trait you can use `CollectingVTActor` /// to capture the sequence of events into a `Vec`. #[derive(Debug, Clone, Eq, PartialEq)] pub enum VTAction { Print(char), ExecuteC0orC1(u8), DcsHook { params: Vec, intermediates: Vec, ignored_excess_intermediates: bool, byte: u8, }, DcsPut(u8), DcsUnhook, EscDispatch { params: Vec, intermediates: Vec, ignored_excess_intermediates: bool, byte: u8, }, CsiDispatch { params: Vec, parameters_truncated: bool, byte: u8, }, OscDispatch(Vec>), ApcDispatch(Vec), } /// This is an implementation of `VTActor` that captures the events /// into an internal vector. /// It can be iterated via `into_iter` or have the internal /// vector extracted via `into_vec`. #[derive(Default)] pub struct CollectingVTActor { actions: Vec, } impl IntoIterator for CollectingVTActor { type Item = VTAction; type IntoIter = std::vec::IntoIter; fn into_iter(self) -> Self::IntoIter { self.actions.into_iter() } } impl CollectingVTActor { pub fn into_vec(self) -> Vec { self.actions } } impl VTActor for CollectingVTActor { fn print(&mut self, b: char) { self.actions.push(VTAction::Print(b)); } fn execute_c0_or_c1(&mut self, control: u8) { self.actions.push(VTAction::ExecuteC0orC1(control)); } fn dcs_hook( &mut self, byte: u8, params: &[i64], intermediates: &[u8], ignored_excess_intermediates: bool, ) { self.actions.push(VTAction::DcsHook { byte, params: params.to_vec(), intermediates: intermediates.to_vec(), ignored_excess_intermediates, }); } fn dcs_put(&mut self, byte: u8) { self.actions.push(VTAction::DcsPut(byte)); } fn dcs_unhook(&mut self) { self.actions.push(VTAction::DcsUnhook); } fn esc_dispatch( &mut self, params: &[i64], intermediates: &[u8], ignored_excess_intermediates: bool, byte: u8, ) { self.actions.push(VTAction::EscDispatch { params: params.to_vec(), intermediates: intermediates.to_vec(), ignored_excess_intermediates, byte, }); } fn csi_dispatch(&mut self, params: &[CsiParam], parameters_truncated: bool, byte: u8) { self.actions.push(VTAction::CsiDispatch { params: params.to_vec(), parameters_truncated, byte, }); } fn osc_dispatch(&mut self, params: &[&[u8]]) { self.actions.push(VTAction::OscDispatch( params.iter().map(|i| i.to_vec()).collect(), )); } fn apc_dispatch(&mut self, data: Vec) { self.actions.push(VTAction::ApcDispatch(data)); } } const MAX_INTERMEDIATES: usize = 2; const MAX_OSC: usize = 64; const MAX_PARAMS: usize = 32; struct OscState { buffer: Vec, param_indices: [usize; MAX_OSC], num_params: usize, full: bool, } impl OscState { fn put(&mut self, param: char) { if param == ';' { match self.num_params { MAX_OSC => { self.full = true; } num => { self.param_indices[num.saturating_sub(1)] = self.buffer.len(); self.num_params += 1; } } } else if !self.full { if self.num_params == 0 { self.num_params = 1; } let mut buf = [0u8; 8]; self.buffer .extend_from_slice(param.encode_utf8(&mut buf).as_bytes()); } } } /// The virtual terminal parser. It works together with an implementation of `VTActor`. pub struct VTParser { state: State, intermediates: [u8; MAX_INTERMEDIATES], num_intermediates: usize, ignored_excess_intermediates: bool, osc: OscState, params: [CsiParam; MAX_PARAMS], num_params: usize, current_param: Option, params_full: bool, apc_data: Vec, utf8_parser: Utf8Parser, utf8_return_state: State, } /// Represents a parameter to a CSI-based escaped sequence. /// /// CSI escapes typically have the form: `CSI 3 m`, but can also /// bundle multiple values together: `CSI 3 ; 4 m`. In both /// of those examples the parameters are simple integer values /// and latter of which would be expressed as a slice containing /// `[CsiParam::Integer(3), CsiParam::Integer(4)]`. /// /// There are some escape sequences that use colons to subdivide and /// extend the meaning. For example: `CSI 4:3 m` is a sequence used /// to denote a curly underline. That would be represented as: /// `[CsiParam::ColonList(vec![Some(4), Some(3)])]`. /// /// Later: reading ECMA 48, CSI is defined as: /// CSI P ... P I ... I F /// Where P are parameter bytes in the range 0x30-0x3F [0-9:;<=>?] /// and I are intermediate bytes in the range 0x20-0x2F /// and F is the final byte in the range 0x40-0x7E /// #[derive(Clone, PartialEq, Eq, Debug, Hash)] pub enum CsiParam { Integer(i64), P(u8), } impl Default for CsiParam { fn default() -> Self { Self::Integer(0) } } impl CsiParam { pub fn as_integer(&self) -> Option { match self { Self::Integer(i) => Some(*i), _ => None, } } } impl std::fmt::Display for CsiParam { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { CsiParam::Integer(v) => { write!(f, "{}", v)?; } CsiParam::P(p) => { write!(f, "{}", *p as char)?; } } Ok(()) } } impl VTParser { #[allow(clippy::new_without_default)] pub fn new() -> Self { let param_indices = [0usize; MAX_OSC]; Self { state: State::Ground, utf8_return_state: State::Ground, intermediates: [0, 0], num_intermediates: 0, ignored_excess_intermediates: false, osc: OscState { buffer: Vec::new(), param_indices, num_params: 0, full: false, }, params: Default::default(), num_params: 0, params_full: false, current_param: None, utf8_parser: Utf8Parser::new(), apc_data: vec![], } } /// Returns if the state machine is in the ground state, /// i.e. there is no pending state held by the state machine. pub fn is_ground(&self) -> bool { self.state == State::Ground } fn as_integer_params(&self) -> [i64; MAX_PARAMS] { let mut res = [0i64; MAX_PARAMS]; let mut i = 0; for src in &self.params[0..self.num_params] { if let CsiParam::Integer(value) = src { res[i] = *value; i += 1; } } res } fn finish_param(&mut self) { if let Some(val) = self.current_param.take() { if self.num_params < MAX_PARAMS { self.params[self.num_params] = val; self.num_params += 1; } } } /// Promote early intermediates to parameters. /// This is handle sequences such as DECSET that use `?` /// prior to other numeric parameters. /// `?` is technically in the intermediate range and shouldn't /// appear in the parameter position according to ECMA 48 fn promote_intermediates_to_params(&mut self) { if self.num_intermediates > 0 { for &p in &self.intermediates[..self.num_intermediates] { if self.num_params >= MAX_PARAMS { self.ignored_excess_intermediates = true; break; } self.params[self.num_params] = CsiParam::P(p); self.num_params += 1; } self.num_intermediates = 0; } } fn action(&mut self, action: Action, param: u8, actor: &mut dyn VTActor) { match action { Action::None | Action::Ignore => {} Action::Print => actor.print(param as char), Action::Execute => actor.execute_c0_or_c1(param), Action::Clear => { self.num_intermediates = 0; self.ignored_excess_intermediates = false; self.osc.num_params = 0; self.osc.full = false; self.num_params = 0; self.params_full = false; self.current_param.take(); self.apc_data.clear(); } Action::Collect => { if self.num_intermediates < MAX_INTERMEDIATES { self.intermediates[self.num_intermediates] = param; self.num_intermediates += 1; } else { self.ignored_excess_intermediates = true; } } Action::Param => { if self.params_full { return; } self.promote_intermediates_to_params(); match param { b'0'..=b'9' => match self.current_param.take() { Some(CsiParam::Integer(i)) => { self.current_param.replace(CsiParam::Integer( i.saturating_mul(10).saturating_add((param - b'0') as i64), )); } Some(_) => unreachable!(), None => { self.current_param .replace(CsiParam::Integer((param - b'0') as i64)); } }, p => { self.finish_param(); if self.num_params + 1 > MAX_PARAMS { self.params_full = true; } else { self.params[self.num_params] = CsiParam::P(p); self.num_params += 1; } } } } Action::Hook => { self.finish_param(); actor.dcs_hook( param, &self.as_integer_params()[0..self.num_params], &self.intermediates[0..self.num_intermediates], self.ignored_excess_intermediates, ); } Action::Put => actor.dcs_put(param), Action::EscDispatch => { self.finish_param(); actor.esc_dispatch( &self.as_integer_params()[0..self.num_params], &self.intermediates[0..self.num_intermediates], self.ignored_excess_intermediates, param, ); } Action::CsiDispatch => { self.finish_param(); self.promote_intermediates_to_params(); actor.csi_dispatch( &self.params[0..self.num_params], self.ignored_excess_intermediates, param, ); } Action::Unhook => actor.dcs_unhook(), Action::OscStart => { self.osc.buffer.clear(); self.osc.num_params = 0; self.osc.full = false; } Action::OscPut => self.osc.put(param as char), Action::OscEnd => { if self.osc.num_params == 0 { actor.osc_dispatch(&[]); } else { let mut params: [&[u8]; MAX_OSC] = [b""; MAX_OSC]; let mut offset = 0usize; let mut slice = self.osc.buffer.as_slice(); let limit = self.osc.num_params.min(MAX_OSC); #[allow(clippy::needless_range_loop)] for i in 0..limit - 1 { let (a, b) = slice.split_at(self.osc.param_indices[i] - offset); params[i] = a; slice = b; offset = self.osc.param_indices[i]; } params[limit - 1] = slice; actor.osc_dispatch(¶ms[0..limit]); } } Action::ApcStart => { self.apc_data.clear(); } Action::ApcPut => { self.apc_data.push(param); } Action::ApcEnd => { actor.apc_dispatch(std::mem::take(&mut self.apc_data)); } Action::Utf8 => self.next_utf8(actor, param), } } // Process a utf-8 multi-byte sequence. // The state tables emit Action::Utf8 to initiate a multi-byte // sequence, and once we're in the utf-8 state we'll defer to // this method for each byte until the Decode struct is signalled // that we're done. // We use the REPLACEMENT_CHARACTER for invalid sequences. // We return to the ground state after each codepoint, successful // or otherwise. fn next_utf8(&mut self, actor: &mut dyn VTActor, byte: u8) { struct Decoder { codepoint: Option, } impl utf8parse::Receiver for Decoder { fn codepoint(&mut self, c: char) { self.codepoint.replace(c); } fn invalid_sequence(&mut self) { self.codepoint(std::char::REPLACEMENT_CHARACTER); } } let mut decoder = Decoder { codepoint: None }; self.utf8_parser.advance(&mut decoder, byte); if let Some(c) = decoder.codepoint { // Slightly gross special cases C1 controls that were // encoded as UTF-8 rather than emitted as raw 8-bit. // If the decoded value is in the byte range, and that // value would cause a state transition, then we process // that state transition rather than performing the default // string accumulation. if c as u32 <= 0xff { let byte = ((c as u32) & 0xff) as u8; let (action, state) = lookup(self.utf8_return_state, byte); if action == Action::Execute || (state != self.utf8_return_state && state != State::Utf8Sequence) { self.action(lookup_exit(self.utf8_return_state), 0, actor); self.action(action, byte, actor); self.action(lookup_entry(state), 0, actor); self.utf8_return_state = self.state; self.state = state; return; } } match self.utf8_return_state { State::Ground => actor.print(c), State::OscString => self.osc.put(c), state => panic!("unreachable state {:?}", state), }; self.state = self.utf8_return_state; } } /// Parse a single byte. This may result in a call to one of the /// methods on the provided `actor`. #[inline(always)] pub fn parse_byte(&mut self, byte: u8, actor: &mut dyn VTActor) { // While in utf-8 parsing mode, co-opt the vt state // table and instead use the utf-8 state table from the // parser. It will drop us back into the Ground state // after each recognized (or invalid) codepoint. if self.state == State::Utf8Sequence { self.next_utf8(actor, byte); return; } let (action, state) = lookup(self.state, byte); if state != self.state { if state != State::Utf8Sequence { self.action(lookup_exit(self.state), 0, actor); } self.action(action, byte, actor); self.action(lookup_entry(state), byte, actor); self.utf8_return_state = self.state; self.state = state; } else { self.action(action, byte, actor); } } /// Parse a sequence of bytes. The sequence need not be complete. /// This may result in some number of calls to the methods on the /// provided `actor`. pub fn parse(&mut self, bytes: &[u8], actor: &mut dyn VTActor) { for b in bytes { self.parse_byte(*b, actor); } } } #[cfg(test)] mod test { use super::*; use k9::assert_equal as assert_eq; fn parse_as_vec(bytes: &[u8]) -> Vec { let mut parser = VTParser::new(); let mut actor = CollectingVTActor::default(); parser.parse(bytes, &mut actor); actor.into_vec() } #[test] fn test_mixed() { assert_eq!( parse_as_vec(b"yo\x07\x1b[32mwoot\x1b[0mdone"), vec![ VTAction::Print('y'), VTAction::Print('o'), VTAction::ExecuteC0orC1(0x07,), VTAction::CsiDispatch { params: vec![CsiParam::Integer(32)], parameters_truncated: false, byte: b'm', }, VTAction::Print('w',), VTAction::Print('o',), VTAction::Print('o',), VTAction::Print('t',), VTAction::CsiDispatch { params: vec![CsiParam::Integer(0)], parameters_truncated: false, byte: b'm', }, VTAction::Print('d',), VTAction::Print('o',), VTAction::Print('n',), VTAction::Print('e',), ] ); } #[test] fn test_print() { assert_eq!( parse_as_vec(b"yo"), vec![VTAction::Print('y'), VTAction::Print('o')] ); } #[test] fn test_osc_with_c1_st() { assert_eq!( parse_as_vec(b"\x1b]0;there\x9c"), vec![VTAction::OscDispatch(vec![ b"0".to_vec(), b"there".to_vec() ])] ); } #[test] fn test_osc_with_bel_st() { assert_eq!( parse_as_vec(b"\x1b]0;hello\x07"), vec![VTAction::OscDispatch(vec![ b"0".to_vec(), b"hello".to_vec() ])] ); } #[test] fn test_decset() { assert_eq!( parse_as_vec(b"\x1b[?1l"), vec![VTAction::CsiDispatch { params: vec![CsiParam::P(b'?'), CsiParam::Integer(1)], parameters_truncated: false, byte: b'l', },] ); } #[test] fn test_osc_too_many_params() { let fields = (0..MAX_OSC + 2) .into_iter() .map(|i| i.to_string()) .collect::>(); let input = format!("\x1b]{}\x07", fields.join(";")); let actions = parse_as_vec(input.as_bytes()); assert_eq!(actions.len(), 1); match &actions[0] { VTAction::OscDispatch(parsed_fields) => { let fields: Vec<_> = fields.into_iter().map(|s| s.as_bytes().to_vec()).collect(); assert_eq!(parsed_fields.as_slice(), &fields[0..MAX_OSC]); } other => panic!("Expected OscDispatch but got {:?}", other), } } #[test] fn test_osc_with_no_params() { assert_eq!( parse_as_vec(b"\x1b]\x07"), vec![VTAction::OscDispatch(vec![])] ); } #[test] fn test_osc_with_esc_sequence_st() { // This case isn't the same as the other OSC cases; even though // `ESC \` is the long form escape sequence for ST, the ESC on its // own breaks out of the OSC state and jumps into the ESC state, // and that leaves the `\` character to be dispatched there in // the calling application. assert_eq!( parse_as_vec(b"\x1b]woot\x1b\\"), vec![ VTAction::OscDispatch(vec![b"woot".to_vec()]), VTAction::EscDispatch { params: vec![], intermediates: vec![], ignored_excess_intermediates: false, byte: b'\\' } ] ); } #[test] fn test_fancy_underline() { assert_eq!( parse_as_vec(b"\x1b[4m"), vec![VTAction::CsiDispatch { params: vec![CsiParam::Integer(4)], parameters_truncated: false, byte: b'm' }] ); assert_eq!( // This is the kitty curly underline sequence. parse_as_vec(b"\x1b[4:3m"), vec![VTAction::CsiDispatch { params: vec![ CsiParam::Integer(4), CsiParam::P(b':'), CsiParam::Integer(3) ], parameters_truncated: false, byte: b'm' }] ); } #[test] fn test_colon_rgb() { assert_eq!( parse_as_vec(b"\x1b[38:2::128:64:192m"), vec![VTAction::CsiDispatch { params: vec![ CsiParam::Integer(38), CsiParam::P(b':'), CsiParam::Integer(2), CsiParam::P(b':'), CsiParam::P(b':'), CsiParam::Integer(128), CsiParam::P(b':'), CsiParam::Integer(64), CsiParam::P(b':'), CsiParam::Integer(192), ], parameters_truncated: false, byte: b'm' }] ); } #[test] fn test_csi_omitted_param() { assert_eq!( parse_as_vec(b"\x1b[;1m"), vec![VTAction::CsiDispatch { params: vec![CsiParam::P(b';'), CsiParam::Integer(1)], parameters_truncated: false, byte: b'm' }] ); } #[test] fn test_csi_too_many_params() { assert_eq!( parse_as_vec(b"\x1b[0;1;2;3;4;5;6;7;8;9;0;1;2;3;4;51;6p"), vec![VTAction::CsiDispatch { params: vec![ CsiParam::Integer(0), CsiParam::P(b';'), CsiParam::Integer(1), CsiParam::P(b';'), CsiParam::Integer(2), CsiParam::P(b';'), CsiParam::Integer(3), CsiParam::P(b';'), CsiParam::Integer(4), CsiParam::P(b';'), CsiParam::Integer(5), CsiParam::P(b';'), CsiParam::Integer(6), CsiParam::P(b';'), CsiParam::Integer(7), CsiParam::P(b';'), CsiParam::Integer(8), CsiParam::P(b';'), CsiParam::Integer(9), CsiParam::P(b';'), CsiParam::Integer(0), CsiParam::P(b';'), CsiParam::Integer(1), CsiParam::P(b';'), CsiParam::Integer(2), CsiParam::P(b';'), CsiParam::Integer(3), CsiParam::P(b';'), CsiParam::Integer(4), CsiParam::P(b';'), CsiParam::Integer(51), CsiParam::P(b';'), ], parameters_truncated: false, byte: b'p' }] ); } #[test] fn test_csi_intermediates() { assert_eq!( parse_as_vec(b"\x1b[1 p"), vec![VTAction::CsiDispatch { params: vec![CsiParam::Integer(1), CsiParam::P(b' ')], parameters_truncated: false, byte: b'p' }] ); assert_eq!( parse_as_vec(b"\x1b[1 !p"), vec![VTAction::CsiDispatch { params: vec![CsiParam::Integer(1), CsiParam::P(b' '), CsiParam::P(b'!')], parameters_truncated: false, byte: b'p' }] ); assert_eq!( parse_as_vec(b"\x1b[1 !#p"), vec![VTAction::CsiDispatch { // Note that the `#` was discarded params: vec![CsiParam::Integer(1), CsiParam::P(b' '), CsiParam::P(b'!')], parameters_truncated: true, byte: b'p' }] ); } #[test] fn osc_utf8() { assert_eq!( parse_as_vec("\x1b]\u{af}\x07".as_bytes()), vec![VTAction::OscDispatch(vec!["\u{af}".as_bytes().to_vec()])] ); } #[test] fn osc_fedora_vte() { assert_eq!( parse_as_vec("\u{9d}777;preexec\u{9c}".as_bytes()), vec![VTAction::OscDispatch(vec![ b"777".to_vec(), b"preexec".to_vec(), ])] ); } #[test] fn print_utf8() { assert_eq!( parse_as_vec("\u{af}".as_bytes()), vec![VTAction::Print('\u{af}')] ); } #[test] fn utf8_control() { assert_eq!( parse_as_vec("\u{8d}".as_bytes()), vec![VTAction::ExecuteC0orC1(0x8d)] ); } #[test] fn tmux_control() { assert_eq!( parse_as_vec("\x1bP1000phello\x1b\\".as_bytes()), vec![ VTAction::DcsHook { byte: b'p', params: vec![1000], intermediates: vec![], ignored_excess_intermediates: false, }, VTAction::DcsPut(b'h'), VTAction::DcsPut(b'e'), VTAction::DcsPut(b'l'), VTAction::DcsPut(b'l'), VTAction::DcsPut(b'o'), VTAction::DcsUnhook, VTAction::EscDispatch { params: vec![], intermediates: vec![], ignored_excess_intermediates: false, byte: b'\\', } ] ); } #[test] fn tmux_passthru() { // I'm not convinced that we *should* represent this tmux sequence // in this way, but it is how it currently maps. // It's worth noting that we see this as final byte `t` here, which // collides with decVT105G in https://vt100.net/emu/dcsseq_dec.html assert_eq!( parse_as_vec("\x1bPtmux;data\x1b\\".as_bytes()), vec![ VTAction::DcsHook { byte: b't', params: vec![], intermediates: vec![], ignored_excess_intermediates: false, }, VTAction::DcsPut(b'm'), VTAction::DcsPut(b'u'), VTAction::DcsPut(b'x'), VTAction::DcsPut(b';'), VTAction::DcsPut(b'd'), VTAction::DcsPut(b'a'), VTAction::DcsPut(b't'), VTAction::DcsPut(b'a'), VTAction::DcsUnhook, VTAction::EscDispatch { params: vec![], intermediates: vec![], ignored_excess_intermediates: false, byte: b'\\', } ] ); } #[test] fn kitty_img() { assert_eq!( parse_as_vec("\x1b_Gf=24,s=10,v=20;payload\x1b\\".as_bytes()), vec![ VTAction::ApcDispatch(b"Gf=24,s=10,v=20;payload".to_vec()), VTAction::EscDispatch { params: vec![], intermediates: vec![], ignored_excess_intermediates: false, byte: b'\\', } ] ); } #[test] fn sixel() { assert_eq!( parse_as_vec("\x1bPqhello\x1b\\".as_bytes()), vec![ VTAction::DcsHook { byte: b'q', params: vec![], intermediates: vec![], ignored_excess_intermediates: false, }, VTAction::DcsPut(b'h'), VTAction::DcsPut(b'e'), VTAction::DcsPut(b'l'), VTAction::DcsPut(b'l'), VTAction::DcsPut(b'o'), VTAction::DcsUnhook, VTAction::EscDispatch { params: vec![], intermediates: vec![], ignored_excess_intermediates: false, byte: b'\\', } ] ); } } vtparse-0.6.2/src/transitions.rs000064400000000000000000000265410072674642500150670ustar 00000000000000// Builds up the transition table for a state machine based on // https://vt100.net/emu/dec_ansi_parser use crate::enums::{Action, State}; /// Apply all u8 values to `fn(u8) -> u16`, return `[u16; 256]`. macro_rules! define_table { ( $func:tt ) => {{ const fn gen() -> [u16; 256] { let mut arr = [0; 256]; let mut i = 0; while i < 256 { arr[i] = $func(i as u8); i += 1; } return arr; } gen() }}; } const fn pack(action: Action, state: State) -> u16 { ((action as u16) << 8) | (state as u16) } const fn anywhere_or(i: u8, state: State) -> u16 { use Action::*; use State::*; match i { 0x18 => pack(Execute, Ground), 0x1a => pack(Execute, Ground), 0x80..=0x8f => pack(Execute, Ground), 0x91..=0x97 => pack(Execute, Ground), 0x99 => pack(Execute, Ground), 0x9a => pack(Execute, Ground), 0x9c => pack(None, Ground), 0x1b => pack(None, Escape), 0x98 => pack(None, SosPmString), 0x9e => pack(None, SosPmString), 0x9f => pack(None, SosPmString), 0x90 => pack(None, DcsEntry), 0x9d => pack(None, OscString), 0x9b => pack(None, CsiEntry), _ => pack(None, state), } } const fn ground(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(Execute, Ground), 0x19 => pack(Execute, Ground), 0x1c..=0x1f => pack(Execute, Ground), 0x20..=0x7f => pack(Print, Ground), // The following three ranges allow for // UTF-8 multibyte sequences to be recognized // and emitted as byte sequences in the ground // state. 0xc2..=0xdf => pack(Utf8, Utf8Sequence), 0xe0..=0xef => pack(Utf8, Utf8Sequence), 0xf0..=0xf4 => pack(Utf8, Utf8Sequence), _ => anywhere_or(i, Ground), } } const fn escape(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(Execute, Escape), 0x19 => pack(Execute, Escape), 0x1c..=0x1f => pack(Execute, Escape), 0x7f => pack(Ignore, Escape), 0x20..=0x2f => pack(Collect, EscapeIntermediate), 0x30..=0x4f => pack(EscDispatch, Ground), 0x51..=0x57 => pack(EscDispatch, Ground), 0x59 => pack(EscDispatch, Ground), 0x5a => pack(EscDispatch, Ground), 0x5c => pack(EscDispatch, Ground), 0x60..=0x7e => pack(EscDispatch, Ground), 0x5b => pack(None, CsiEntry), 0x5d => pack(None, OscString), 0x50 => pack(None, DcsEntry), 0x58 => pack(None, SosPmString), 0x5e => pack(None, SosPmString), 0x5f => pack(None, ApcString), _ => anywhere_or(i, Escape), } } const fn escape_intermediate(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(Execute, EscapeIntermediate), 0x19 => pack(Execute, EscapeIntermediate), 0x1c..=0x1f => pack(Execute, EscapeIntermediate), 0x20..=0x2f => pack(Collect, EscapeIntermediate), 0x7f => pack(Ignore, EscapeIntermediate), 0x30..=0x7e => pack(EscDispatch, Ground), _ => anywhere_or(i, EscapeIntermediate), } } const fn csi_entry(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(Execute, CsiEntry), 0x19 => pack(Execute, CsiEntry), 0x1c..=0x1f => pack(Execute, CsiEntry), 0x7f => pack(Ignore, CsiEntry), 0x20..=0x2f => pack(Collect, CsiIntermediate), 0x3a => pack(None, CsiIgnore), 0x30..=0x39 => pack(Param, CsiParam), 0x3b => pack(Param, CsiParam), 0x3c..=0x3f => pack(Collect, CsiParam), 0x40..=0x7e => pack(CsiDispatch, Ground), _ => anywhere_or(i, CsiEntry), } } const fn csi_param(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(Execute, CsiParam), 0x19 => pack(Execute, CsiParam), 0x1c..=0x1f => pack(Execute, CsiParam), 0x30..=0x3b => pack(Param, CsiParam), 0x7f => pack(Ignore, CsiParam), 0x3c..=0x3f => pack(None, CsiIgnore), 0x20..=0x2f => pack(Collect, CsiIntermediate), 0x40..=0x7e => pack(CsiDispatch, Ground), _ => anywhere_or(i, CsiParam), } } const fn csi_intermediate(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(Execute, CsiIntermediate), 0x19 => pack(Execute, CsiIntermediate), 0x1c..=0x1f => pack(Execute, CsiIntermediate), 0x20..=0x2f => pack(Collect, CsiIntermediate), 0x7f => pack(Ignore, CsiIntermediate), 0x30..=0x3f => pack(None, CsiIgnore), 0x40..=0x7e => pack(CsiDispatch, Ground), _ => anywhere_or(i, CsiIntermediate), } } const fn csi_ignore(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(Execute, CsiIgnore), 0x19 => pack(Execute, CsiIgnore), 0x1c..=0x1f => pack(Execute, CsiIgnore), 0x20..=0x3f => pack(Ignore, CsiIgnore), 0x7f => pack(Ignore, CsiIgnore), 0x40..=0x7e => pack(None, Ground), _ => anywhere_or(i, CsiIgnore), } } const fn dcs_entry(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(Ignore, DcsEntry), 0x19 => pack(Ignore, DcsEntry), 0x1c..=0x1f => pack(Ignore, DcsEntry), 0x7f => pack(Ignore, DcsEntry), 0x3a => pack(None, DcsIgnore), 0x20..=0x2f => pack(Collect, DcsIntermediate), 0x30..=0x39 => pack(Param, DcsParam), 0x3b => pack(Param, DcsParam), 0x3c..=0x3f => pack(Collect, DcsParam), 0x40..=0x7e => pack(None, DcsPassthrough), _ => anywhere_or(i, DcsEntry), } } const fn dcs_param(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(Ignore, DcsParam), 0x19 => pack(Ignore, DcsParam), 0x1c..=0x1f => pack(Ignore, DcsParam), 0x30..=0x39 => pack(Param, DcsParam), 0x3b => pack(Param, DcsParam), 0x7f => pack(Ignore, DcsParam), 0x3a => pack(None, DcsIgnore), 0x3c..=0x3f => pack(None, DcsIgnore), 0x20..=0x2f => pack(Collect, DcsIntermediate), 0x40..=0x7e => pack(None, DcsPassthrough), _ => anywhere_or(i, DcsParam), } } const fn dcs_intermediate(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(Ignore, DcsIntermediate), 0x19 => pack(Ignore, DcsIntermediate), 0x1c..=0x1f => pack(Ignore, DcsIntermediate), 0x20..=0x2f => pack(Collect, DcsIntermediate), 0x7f => pack(Ignore, DcsIntermediate), 0x30..=0x3f => pack(None, DcsIgnore), 0x40..=0x7e => pack(None, DcsPassthrough), _ => anywhere_or(i, DcsIntermediate), } } const fn dcs_passthrough(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(Put, DcsPassthrough), 0x19 => pack(Put, DcsPassthrough), 0x1c..=0x1f => pack(Put, DcsPassthrough), 0x20..=0x7e => pack(Put, DcsPassthrough), 0x7f => pack(Ignore, DcsPassthrough), _ => anywhere_or(i, DcsPassthrough), } } const fn dcs_ignore(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(Ignore, DcsIgnore), 0x19 => pack(Ignore, DcsIgnore), 0x1c..=0x1f => pack(Ignore, DcsIgnore), 0x20..=0x7f => pack(Ignore, DcsIgnore), _ => anywhere_or(i, DcsIgnore), } } const fn osc_string(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x06 => pack(Ignore, OscString), // Using BEL in place of ST is a deviation from // https://vt100.net/emu/dec_ansi_parser and was // introduced AFAICT by xterm 0x07 => pack(Ignore, Ground), 0x08..=0x17 => pack(Ignore, OscString), 0x19 => pack(Ignore, OscString), 0x1c..=0x1f => pack(Ignore, OscString), 0x20..=0x7f => pack(OscPut, OscString), // This extended range allows for UTF-8 characters // to be embedded in OSC parameters. It is not // part of the base state machine. 0xc2..=0xdf => pack(Utf8, Utf8Sequence), 0xe0..=0xef => pack(Utf8, Utf8Sequence), 0xf0..=0xf4 => pack(Utf8, Utf8Sequence), _ => anywhere_or(i, OscString), } } const fn sos_pm_string(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(Ignore, SosPmString), 0x19 => pack(Ignore, SosPmString), 0x1c..=0x1f => pack(Ignore, SosPmString), 0x20..=0x7f => pack(Ignore, SosPmString), _ => anywhere_or(i, SosPmString), } } const fn apc_string(i: u8) -> u16 { use Action::*; use State::*; match i { 0x00..=0x17 => pack(ApcPut, ApcString), 0x19 => pack(ApcPut, ApcString), 0x1c..=0x1f => pack(ApcPut, ApcString), 0x20..=0x7f => pack(ApcPut, ApcString), _ => anywhere_or(i, ApcString), } } pub(crate) static TRANSITIONS: [[u16; 256]; 15] = [ define_table!(ground), define_table!(escape), define_table!(escape_intermediate), define_table!(csi_entry), define_table!(csi_param), define_table!(csi_intermediate), define_table!(csi_ignore), define_table!(dcs_entry), define_table!(dcs_param), define_table!(dcs_intermediate), define_table!(dcs_passthrough), define_table!(dcs_ignore), define_table!(osc_string), define_table!(sos_pm_string), define_table!(apc_string), ]; pub(crate) static ENTRY: [Action; 17] = [ Action::None, // Ground Action::Clear, // Escape Action::None, // EscapeIntermediate Action::Clear, // CsiEntry Action::None, // CsiParam Action::None, // CsiIntermediate Action::None, // CsiIgnore Action::Clear, // DcsEntry Action::None, // DcsParam Action::None, // DcsIntermediate Action::Hook, // DcsPassthrough Action::None, // DcsIgnore Action::OscStart, // OscString Action::None, // SosPmString Action::ApcStart, // ApcString Action::None, // Anywhere Action::None, // Utf8Sequence ]; pub(crate) static EXIT: [Action; 17] = [ Action::None, // Ground Action::None, // Escape Action::None, // EscapeIntermediate Action::None, // CsiEntry Action::None, // CsiParam Action::None, // CsiIntermediate Action::None, // CsiIgnore Action::None, // DcsEntry Action::None, // DcsParam Action::None, // DcsIntermediate Action::Unhook, // DcsPassthrough Action::None, // DcsIgnore Action::OscEnd, // OscString Action::None, // SosPmString Action::ApcEnd, // ApcString Action::None, // Anywhere Action::None, // Utf8Sequence ]; #[cfg(test)] mod tests { use super::*; #[test] fn test_transitions() { let v = format!("{:?}", TRANSITIONS).as_bytes().to_vec(); assert_eq!( ( v.len(), hash(&v, 0, 1), hash(&v, 5381, 33), // djb2 hash(&v, 0, 65599), // sdbm ), (17385, 799944, 12647816782590382477, 3641575052870461598) ); } fn hash(v: &[u8], init: u64, mul: u64) -> u64 { v.iter() .fold(init, |a, &b| a.wrapping_mul(mul).wrapping_add(b as u64)) } }