pax_global_header00006660000000000000000000000064147173704760014532gustar00rootroot0000000000000052 comment=b2a1dc9967b556f9a576ce0f7ded2a457ba4aefa Richienb-char-regex-11dd55b/000077500000000000000000000000001471737047600157075ustar00rootroot00000000000000Richienb-char-regex-11dd55b/.editorconfig000066400000000000000000000002761471737047600203710ustar00rootroot00000000000000# editorconfig.org root = true [*] end_of_line = lf charset = utf-8 trim_trailing_whitespace = true insert_final_newline = true indent_style = tab [*.md] trim_trailing_whitespace = false Richienb-char-regex-11dd55b/.github/000077500000000000000000000000001471737047600172475ustar00rootroot00000000000000Richienb-char-regex-11dd55b/.github/workflows/000077500000000000000000000000001471737047600213045ustar00rootroot00000000000000Richienb-char-regex-11dd55b/.github/workflows/main.yml000066400000000000000000000006451471737047600227600ustar00rootroot00000000000000name: CI on: - push - pull_request jobs: test: name: Node.js ${{ matrix.node-version }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: node-version: - 20 - 18 steps: - uses: actions/checkout@v2 - uses: actions/setup-node@v2 with: node-version: ${{ matrix.node-version }} - run: npm install - run: npm test Richienb-char-regex-11dd55b/.npmrc000066400000000000000000000000231471737047600170220ustar00rootroot00000000000000package-lock=false Richienb-char-regex-11dd55b/fixture/000077500000000000000000000000001471737047600173755ustar00rootroot00000000000000Richienb-char-regex-11dd55b/fixture/all-telugu-chars.js000066400000000000000000000061431471737047600231100ustar00rootroot00000000000000// Using visual code point representation for better human intelligibility const independentVowels = ['అ', 'ఆ', 'ఇ', 'ఈ', 'ఉ', 'ఊ', 'ఋ', 'ౠ', 'ఌ', 'ౡ', 'ఎ', 'ఏ', 'ఐ', 'ఒ', 'ఓ', 'ఔ']; const diacriticVowels = ['ా', 'ి', 'ీ', 'ు', 'ూ', 'ృ', 'ౄ', 'ౢ', 'ౣ', 'ె', 'ే', 'ై', 'ొ', 'ో', 'ౌ']; const consonants = ['క', 'ఖ', 'గ', 'ఘ', 'ఙ', 'చ', 'ఛ', 'జ', 'ఝ', 'ఞ', 'ట', 'ఠ', 'డ', 'ఢ', 'ణ', 'త', 'థ', 'ద', 'ధ', 'న', 'ప', 'ఫ', 'బ', 'భ', 'మ', 'య', 'ర', 'ల', 'వ', 'ళ', 'శ', 'ష', 'స', 'హ', 'ఱ']; const rareConsonants = ['ౘ', 'ౙ', 'ౚ']; const modifiers = ['్', 'ఁ', 'ం', 'ః', 'ౕ', 'ౖ']; const numerals = ['౦', '౧', '౨', '౩', '౪', '౫', '౬', '౭', '౮', '౯', '౸', '౹', '౺', '౻', '౼', '౽', '౾']; const virama = '్'; const doubleCombos = []; // Telugu symbols built out of two code points const tripleCombos = []; // Telugu symbols built out of three code points // Consonants can be combined with many other character modifiers for (const consonant of consonants) { // Consonant + vowel for (const vowel of diacriticVowels) { doubleCombos.push(String.fromCodePoint(consonant.codePointAt(0), vowel.codePointAt(0))); } // Consonant + special vowel modifier or length mark for (const modifier of modifiers) { doubleCombos.push(String.fromCodePoint(consonant.codePointAt(0), modifier.codePointAt(0))); } // Consonant + consonant (separated by ్) for (const consonant2 of consonants) { tripleCombos.push(String.fromCodePoint(consonant.codePointAt(0), virama.codePointAt(0), consonant2.codePointAt(0))); } } // Rare consonants like common consonants, but lack the consonant conjuncts for (const consonant of rareConsonants) { // Rare consonant + vowel for (const vowel of diacriticVowels) { doubleCombos.push(String.fromCodePoint(consonant.codePointAt(0), vowel.codePointAt(0))); } // Rare consonant + special vowel modifier or length mark for (const modifier of modifiers) { doubleCombos.push(String.fromCodePoint(consonant.codePointAt(0), modifier.codePointAt(0))); } } /** Create all single Telugu characters possible. @return {string[]} All single Telugu chars possible. */ export function createAllTeluguChars() { return [...independentVowels, ...consonants, ...rareConsonants, ...numerals, ...doubleCombos, ...tripleCombos]; } /** Create Telugu character pairs that might occur. Although it's possible in theory to create missed cases by combining certain single characters, they are hopefully not something that would happen in written Telugu. @return {string[]} Telugu char pairs. */ export function createTeluguCharPairs() { const sampleCharsOneCodePoint = [...independentVowels, ...consonants, ...rareConsonants, ...numerals]; const sampleCharsTwoCodePoints = [...doubleCombos]; const charPairs = []; for (const char1 of sampleCharsOneCodePoint) { for (const char2 of sampleCharsOneCodePoint) { charPairs.push(`${char1}${char2}`); } for (const char2 of sampleCharsTwoCodePoints) { charPairs.push(`${char1}${char2}`); } } return charPairs; } Richienb-char-regex-11dd55b/index.d.ts000066400000000000000000000004761471737047600176170ustar00rootroot00000000000000/** @returns A regex to match any full character, considering weird character ranges. @example ``` import charRegex from 'char-regex'; '❤️👊🏽'.match(/./); //=> ['', '', '', '', '', '', ''] '❤️👊🏽'.match(charRegex()); //=> ['❤️', '👊🏽'] ``` */ export default function charRegex(): RegExp; Richienb-char-regex-11dd55b/index.js000066400000000000000000000052071471737047600173600ustar00rootroot00000000000000// Based on https://github.com/lodash/lodash/blob/6018350ac10d5ce6a5b7db625140b82aeab804df/.internal/unicodeSize.js export default function charRegex() { // Unicode character classes const astralRange = '\\ud800-\\udfff'; const comboMarksRange = '\\u0300-\\u036f'; const comboHalfMarksRange = '\\ufe20-\\ufe2f'; const comboSymbolsRange = '\\u20d0-\\u20ff'; const comboMarksExtendedRange = '\\u1ab0-\\u1aff'; const comboMarksSupplementRange = '\\u1dc0-\\u1dff'; const comboRange = comboMarksRange + comboHalfMarksRange + comboSymbolsRange + comboMarksExtendedRange + comboMarksSupplementRange; const variableRange = '\\ufe0e\\ufe0f'; // Telugu characters const teluguVowels = '\\u0c05-\\u0c0c\\u0c0e-\\u0c10\\u0c12-\\u0c14\\u0c60-\\u0c61'; const teluguVowelsDiacritic = '\\u0c3e-\\u0c44\\u0c46-\\u0c48\\u0c4a-\\u0c4c\\u0c62-\\u0c63'; const teluguConsonants = '\\u0c15-\\u0c28\\u0c2a-\\u0c39'; const teluguConsonantsRare = '\\u0c58-\\u0c5a'; const teluguModifiers = '\\u0c01-\\u0c03\\u0c4d\\u0c55\\u0c56'; const teluguNumerals = '\\u0c66-\\u0c6f\\u0c78-\\u0c7e'; const teluguSingle = `[${teluguVowels}${teluguNumerals}${teluguConsonantsRare}]|[${teluguConsonants}](?!\\u0c4d)`; const teluguDouble = `[${teluguConsonants}${teluguConsonantsRare}][${teluguVowelsDiacritic}]|[${teluguConsonants}${teluguConsonantsRare}][${teluguModifiers}]`; const teluguTriple = `[${teluguConsonants}]\\u0c4d[${teluguConsonants}]`; const telugu = `(?:${teluguTriple}|${teluguDouble}|${teluguSingle})`; // Unicode capture groups const astral = `[${astralRange}]`; const combo = `[${comboRange}]`; const fitz = '\\ud83c[\\udffb-\\udfff]'; const modifier = `(?:${combo}|${fitz})`; const nonAstral = `[^${astralRange}]`; const regional = '(?:\\ud83c[\\udde6-\\uddff]){2}'; const surrogatePair = '[\\ud800-\\udbff][\\udc00-\\udfff]'; const zeroWidthJoiner = '\\u200d'; const blackFlag = '(?:\\ud83c\\udff4\\udb40\\udc67\\udb40\\udc62\\udb40(?:\\udc65|\\udc73|\\udc77)\\udb40(?:\\udc6e|\\udc63|\\udc6c)\\udb40(?:\\udc67|\\udc74|\\udc73)\\udb40\\udc7f)'; // Unicode regexes const optModifier = `${modifier}?`; const optVariable = `[${variableRange}]?`; const optJoin = `(?:${zeroWidthJoiner}(?:${[nonAstral, regional, surrogatePair].join('|')})${optVariable + optModifier})*`; const seq = optVariable + optModifier + optJoin; const nonAstralCombo = `${nonAstral}${combo}?`; const symbol = `(?:${[blackFlag, nonAstralCombo, combo, regional, surrogatePair, astral].join('|')})`; // Match string symbols (https://mathiasbynens.be/notes/javascript-unicode) return new RegExp(`${fitz}(?=${fitz})|${telugu}|${symbol + seq}`, 'g'); // eslint-disable-line no-misleading-character-class } Richienb-char-regex-11dd55b/index.test-d.ts000066400000000000000000000001461471737047600205660ustar00rootroot00000000000000import {expectType} from 'tsd'; import charRegex from './index.js'; expectType(charRegex()); Richienb-char-regex-11dd55b/license000066400000000000000000000020661471737047600172600ustar00rootroot00000000000000MIT License Copyright (c) 2020 - 2024 Richie Bendall Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Richienb-char-regex-11dd55b/package.json000066400000000000000000000012241471737047600201740ustar00rootroot00000000000000{ "name": "char-regex", "version": "2.0.2", "description": "A regex to match any full character, considering weird character ranges.", "repository": "Richienb/char-regex", "author": { "name": "Richie Bendall", "email": "richiebendall@gmail.com" }, "license": "MIT", "type": "module", "exports": "./index.js", "files": [ "index.js", "index.d.ts" ], "engines": { "node": ">=12.20" }, "scripts": { "test": "xo && ava && tsd" }, "keywords": [ "character", "regex", "match", "split", "length" ], "dependencies": {}, "devDependencies": { "all-chars": "^1.0.0", "ava": "^6.2.0", "tsd": "^0.31.2", "xo": "^0.59.3" } } Richienb-char-regex-11dd55b/readme.md000066400000000000000000000010101471737047600174560ustar00rootroot00000000000000# char-regex A regex to match any full character, considering weird character ranges. Tested on every single emoji and unicode character. Based on the Lodash implementation. ## Install ```sh npm install char-regex ``` ## Usage ```js import charRegex from 'char-regex'; '❤️👊🏽'.match(/./); //=> ['', '', '', '', '', '', ''] '❤️👊🏽'.match(charRegex()); //=> ['❤️', '👊🏽'] ``` ## Related - [string-length](https://github.com/sindresorhus/string-length) - Get the real length of a string Richienb-char-regex-11dd55b/test.js000066400000000000000000000027611471737047600172320ustar00rootroot00000000000000import test from 'ava'; import createAllChars from 'all-chars'; import {createAllTeluguChars, createTeluguCharPairs} from './fixture/all-telugu-chars.js'; import createCharRegex from './index.js'; const allChars = createAllChars(); const allTeluguChars = createAllTeluguChars(); const sampleTeluguCharPairs = createTeluguCharPairs(); const charRegex = createCharRegex(); function getCodePoints(string) { let result = ''; for (let index = 0; index < string.length; index++) { result += `\\u${string.codePointAt(index)}`; } return result; } // See https://mathiasbynens.be/notes/javascript-unicode#poo-test test('The Pile of Poo Test™', t => { t.deepEqual('Iñtërnâtiônàlizætiøn☃💩'.match(charRegex), [ 'I', 'ñ', 't', 'ë', 'r', 'n', 'â', 't', 'i', 'ô', 'n', 'à', 'l', 'i', 'z', 'æ', 't', 'i', 'ø', 'n', '☃', '💩', ]); }); // Test for Telugu language with custom code point combinations for (const character of allTeluguChars) { test(`Test Telugu "${character}" (${getCodePoints(character)})`, t => { t.deepEqual(character.match(charRegex), [character]); }); } // Test for Telugu language generating certain char pairs for (const characters of sampleTeluguCharPairs) { test(`Test Telugu char pairs "${characters}" (${getCodePoints(characters)})`, t => { t.true(characters.match(charRegex).length === 2); }); } for (const character of allChars) { test(`Test "${character}" (${getCodePoints(character)})`, t => { t.deepEqual(character.match(charRegex), [character]); }); }