pax_global_header00006660000000000000000000000064151375520630014521gustar00rootroot0000000000000052 comment=ddf012db65a9954ec1756293d241278a3f278ac6 wn-1.0.0/000077500000000000000000000000001513755206300121435ustar00rootroot00000000000000wn-1.0.0/.github/000077500000000000000000000000001513755206300135035ustar00rootroot00000000000000wn-1.0.0/.github/ISSUE_TEMPLATE/000077500000000000000000000000001513755206300156665ustar00rootroot00000000000000wn-1.0.0/.github/ISSUE_TEMPLATE/bug_report.md000066400000000000000000000021121513755206300203540ustar00rootroot00000000000000--- name: Bug report about: Create a report to help us improve title: '' labels: bug assignees: '' --- **Describe the bug** A clear and concise description of what the bug is. :warning: If this is a question about Wn or how to use it, please create a [discussion](https://github.com/goodmami/wn/discussions) instead of an issue. **To Reproduce** Please enter a minimal working example of the command or Python code that illustrates the problem. To avoid formatting issues, enter the code in a Markdown code block: ```console $ python -m wn ... output... ``` or ```pycon >>> import wn >>> ... output ``` **Expected behavior** A clear and concise description of what you expected to happen. **Environment** Please enter the versions of Python and Wn you are using as well as the installed lexicons. You can find these by executing the following commands (adjust your platform-specific Python command as necessary, e.g., `python3` or `py -3`): ```console python --version python -m wn --version python -m wn lexicons ``` **Additional context** Add any other context about the problem here. 
wn-1.0.0/.github/ISSUE_TEMPLATE/data-issue.md000066400000000000000000000021351513755206300202500ustar00rootroot00000000000000--- name: Data issue about: Report an issue Wn's data index title: '' labels: data assignees: '' --- **If your issue is regarding the contents of the data** (e.g., a lexicon is missing a word, synset, relation, etc.), then please find the upstream project and file the issue there. You can find links to the projects on Wn's [README](https://github.com/goodmami/wn/). Projects without links are probably managed by the [Open Multilingual Wordnet](https://github.com/omwn/omw-data). **Use this issue template for the following kinds of issues:** 1. Request a wordnet lexicon (including new versions of existing lexicons) to be indexed by Wn Please provide: - the project name - the name and contact info of the current maintainer - the language of the lexicon (BCP-47 code preferred) - a URL to the project (e.g., on GitHub or other homepage) - a URL to the [WN-LMF](https://github.com/globalwordnet/schemas/) resource 2. Report an issue with an indexed lexicon (e.g., the source URL has changed) Please indicate the lexicon id and version and the correct project information, if available. wn-1.0.0/.github/ISSUE_TEMPLATE/feature_request.md000066400000000000000000000011341513755206300214120ustar00rootroot00000000000000--- name: Feature request about: Suggest an idea for this project title: '' labels: enhancement assignees: '' --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen. **Describe alternatives you've considered** A clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context or screenshots about the feature request here. 
wn-1.0.0/.github/workflows/000077500000000000000000000000001513755206300155405ustar00rootroot00000000000000wn-1.0.0/.github/workflows/checks.yml000066400000000000000000000017011513755206300175220ustar00rootroot00000000000000name: tests on: push: branches: [main] pull_request: branches: [main] jobs: lint: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v4 with: python-version: "3.10" - name: Install Hatch run: pipx install hatch - name: Lint run: hatch fmt --linter --check - name: Type Check run: hatch run mypy:check - name: Check Buildable run: hatch build tests: runs-on: ${{ matrix.os }} strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] os: [ubuntu-latest, windows-latest] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install Hatch run: pipx install hatch - name: Test run: hatch test wn-1.0.0/.github/workflows/publish.yml000066400000000000000000000034101513755206300177270ustar00rootroot00000000000000name: Build and Publish to PyPI or TestPyPI on: push jobs: build: name: Build distribution runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v4 with: python-version: "3.x" - name: Install Hatch run: pipx install hatch - name: Build run: hatch build - name: Store the distribution packages uses: actions/upload-artifact@v4 with: name: python-package-distributions path: dist/ publish-to-pypi: name: Publish distributions to PyPI if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes needs: - build runs-on: ubuntu-latest environment: name: pypi url: https://pypi.org/p/wn permissions: id-token: write # IMPORTANT: mandatory for trusted publishing steps: - name: Download the dists uses: actions/download-artifact@v4.1.8 with: name: python-package-distributions path: dist/ - name: Publish to 
PyPI uses: pypa/gh-action-pypi-publish@release/v1 publish-to-testpypi: name: Publish distributions to TestPyPI needs: - build runs-on: ubuntu-latest environment: name: testpypi url: https://test.pypi.org/p/wn permissions: id-token: write # IMPORTANT: mandatory for trusted publishing steps: - name: Download the dists uses: actions/download-artifact@v4.1.8 with: name: python-package-distributions path: dist/ - name: Publish to TestPyPI uses: pypa/gh-action-pypi-publish@release/v1 with: repository-url: https://test.pypi.org/legacy/ skip-existing: true wn-1.0.0/.gitignore000066400000000000000000000014331513755206300141340ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Ruff (has its own .gitignore, but in case that ever changes...) 
.ruff_cache # Sphinx documentation docs/_build/ # Jupyter Notebook .ipynb_checkpoints # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # mypy .mypy_cache/ .dmypy.json dmypy.json # PyCharm .idea/ # VS Code .vscode/ # benchmarking results .benchmarks/wn-1.0.0/CHANGELOG.md000066400000000000000000000717301513755206300137640ustar00rootroot00000000000000# Change Log ## [Unreleased][unreleased] Notable changes in this release: * A new version of the database schema requires a database rebuild * A new `wn.ili` module deals with ILI files and objects; interlingual queries still use the `Synset.ili` member, which is now a simple `str` * The Open English Wordnet versions 2025 and 2025+ are added to the index ### Index * Add `oewn:2025` ([#294]) * Add `oewn:2025+` ([#294]) * Add `omw:2.0`, including `2.0` versions of individual OMW lexicons ([#300]) ### Schema * Add `specifier` column to `lexicon` table ([#234]) * Remove `lexicalized` column from `synsets` and `senses` ([#248]) * Add `unlexicalized_synsets` and `unlexicalized_senses` tables ([#248]) * Add `lexicon_rowid` column to `pronunciations` and `tags` ([#303]) ### Added * `wn.lemmas()` function and `Wordnet.lemmas()` method to query all lemmas at once. 
* Support for WN-LMF 1.4 ([#260]) - Sense ordering: `index` on `` and `n` on `` - New sense relations: - `metaphor` - `has_metaphor` - `metonym` - `has_metonym` - `agent` - `material` - `event` - `instrument` - `location` - `by_means_of` - `undergoer` - `property` - `result` - `state` - `uses` - `destination` - `body_part` - `vehicle` - `ref` attribute for `` and `` ([#301]) * `wn.ili` module * `wn.Sense.synset_relations()` ([#271]) * `wn.Pronunciation.lexicon()` method ([#303]) * `wn.Tag.lexicon()` method ([#303]) * Support for exporting lexicon extensions ([#103]) * `wn.compat.sensekey` supports the `oewn-v2` flavor for escaping and unescaping for the scheme used by OEWN 2025 ([#292]) * `wn.compat.sensekey` supports the `oewn:2025` and `oewn:2025+` lexicons for the `sense_key_getter` and `sense_getter` functions ([#292]) * `wn.reset_database()` function for reinitializing an outdated database. ### Removed * `wn.web` module ([#295]) * `wn.Synset.relation_map()` method ([#271]) * `wn.Sense.relation_map()` method ([#271]) ### Changed * Default form normalizer uses casefold instead of lower ([#233]) * `Synset.ili` is a `str` instead of an `ILI` object. * `Wordnet.synsets()` method and `wn.synsets()` function's only accepts `ili` `str` arguments for the `ili` parameter again, reverting a change from v0.12.0. This is because `Synset.ili` is now a simple string and `ILI` objects are no longer part of the core `wn` package namespace. 
* `wn.Synset.relations()`: return `wn.Relation` to `wn.Synset` mapping when using `data=True` ([#271]) * `wn.Sense.relations()`: return `wn.Relation` to `wn.Sense` mapping when using `data=True` ([#271]) * Queries of relations can specify different lexicons for source and target (part of [#103]; not a user-facing change) ### Fixed * WN-LMF 1.1+ `` exported properly ([#302]) * WN-LMF 1.1+ `subcat` attribute exported properly ([#302]) ### Documentation * Correct docstring for `wn.taxonomy.taxonomy_depth()` ([#291]) ## [v0.14.0] **Release date: 2025-11-16** ### Python Support * Removed support for Python 3.9 * Added support for Python 3.14 ### Added * Preliminary XML-only support for WN-LMF 1.4 ([#260]) * `lexicon()` method on `Form`, `Example`, `Definition`, and `Count` ([#286]) * `confidence()` method ([#263]) - On `Lexicon` defaults to 1.0 - On existing `ILI`, defaults to 1.0 - On `Word`, `Sense`, `Synset`, `Relation`, `Example`, `Definition`, and `Count`, defaults to the confidence of their lexicon. 
* `/` (index) and `/health` endpoints for `wn.web` (see [#268]) ### Changed * `wn.web`: returns `JSONResponse` on most errors ([#277]) ### Fixed * Encode example metadata on export ([#285]) * Update LMF to use `https` in `dc` namespace ### Maintenance * Added `py.typed` file to repository ([#266]) * Use `tomllib` instead of `tomli` for Python 3.11+ ## [v0.13.0] **Release date: 2025-06-13** ### Added * Support for WN-LMF 1.4 ([#260]) * `wn.compat` namespace (see [#55]) * `wn.compat.sensekey` module ([#55]) with methods: - `sense_key_getter()` - `sense_getter()` - `unescape_oewn_sense_key()` - `escape_oewn_sense_key()` * `wn.project.get_project()` ([#53]) * `wn.project.Project` ([#53]) * `wn.project.ResourceOnlyPackage` ([#53]) * `path` property on `wn.project.Project` classes ([#53]) * `delete` parameter on `wn.project.iterpackages()` ([#53]) ### Changed * `wn.add()` allows synset members to be lexical entry IDs for rank calculations ([#255]) * `wn.add()` no longer requires `partOfSpeech` on synsets; this was not a requirement of WN-LMF nor was it enforced in the database * `wn.export()` defaults to `version="1.4"` instead of `"1.0"` ## [v0.12.0] **Release date: 2025-04-22** ### Added * `wn.add_lexical_resource()` to add result of `wn.lmf.load()` to database rather than from a file (pertinent to [#98]) * `bench/` directory with benchmark tests ([#98]) * `Synset.definitions()` ([#246]) ### Fixed * `wn.web` casts URL objects to strings for JSON serialization ([#238]) * Setting `wn.config.data_directory` to an uninitialized directory no longer raises a `sqlite3.OperationalError` ([#250]) ### Changed * `Wordnet` and module-level query functions now issue a warning when the `lang` argument matches more than one lexicon ([#241]) * `Wordnet.synsets()` now accepts `wn.ILI` objects for the `ili` parameter ([#235]) * DB-internal rowids are no longer used outside of SQL queries ([#226]) * The following methods now return standard `str` objects by default and custom classes 
with a `data=True` argument ([#246]): - `Word.lemma()` - `Word.forms()` - `Sense.examples()` - `Synset.examples()` - `Synset.definition()` * `Sense.counts()` now returns a standard `int` object by default and a custom class with a `data=True` argument ([#246]) * The following classes no longer subclass standard `str` or `int` types and therefore no longer inherit their behavior or interface ([#246]): - `Form` - `Example` - `Definition` - `Count` ## [v0.11.0] **Release date: 2024-12-11** ### Index * Added `oewn:2024` ([#221]) ### Added * `Relation` class ([#216]) * `Sense.relation_map()` method ([#216]) * `Synset.relation_map()` method ([#167], [#216]) * `W305` blank definition on synset validation ([#151]) * `W306` blank example on synset validation ([#151]) * `W307` repeated definition on synset validation ([#151]) ### Fixed * Enumerate repeated entry, sense, synset IDs for validation ([#228]) ## [v0.10.1] **Release date: 2024-10-29** ### Fixed * Follow redirects with `httpx.Client` in `wn._download` ([#211]) * Remove reverse relations for `pertainym` and `also` ([#213]) * Validate redundant relations considering `dc:type` ([#215]) ### Maintenance * Added `docs/.readthedocs.yaml` for building docs ([#214]) ## [v0.10.0] **Release date: 2024-10-29** ### Python Support * Removed support for Python 3.8 ([#202]) * Added support for Python 3.13 ([#202]) ### Added * Support for WN-LMF 1.2 and 1.3 ([#200]) ### Fixed * Don't assume 'id' on form elements in WN-LMF 1.2+ ([#207]) ### Maintenance * Switched packaging from flit to Hatch ([#201]) * Updated dependencies, CI warnings, old workarounds ([#203]) * Change CI publishing to OIDC trusted publishing ## [v0.9.5] **Release date: 2023-12-05** ### Python Support * Removed support for Python 3.7 ([#191]) * Added support for Python 3.12 ([#191]) ### Index * Added `oewn:2023` ([#194]) ## [v0.9.4] **Release date: 2023-05-07** ### Index * Added `oewn:2022` ([#181]) ## [v0.9.3] **Release date: 2022-11-13** ### Python Support * 
Removed support for Python 3.6 * Added support for Python 3.11 ### Fixed * `wn.Synset.relations()` no longer raises a `KeyError` when no relation types are given and relations are found via ILI ([#177]) ## [v0.9.2] **Release date: 2022-10-02** ### Provisional Changes * The `editor` installation extra installs the `wn-editor` package. This is not a normal way of using extras, as it installs a dependent and not a dependency, and may be removed. ([#17]) ### Fixed * `wn.download()` no longer uses Python features unavailable in 3.7 when recovering from download errors * `Sense.synset()` now creates a `Synset` properly linked to the same `Wordnet` object ([#157], [#168]) * `Sense.word()` now creates a `Word` properly linked to the same `Wordnet` object ([#157]) * `Synset.relations()` uses the correct relation type for those obtained from expand lexicons ([#169]) ## [v0.9.1] **Release date: 2021-11-23** ### Fixed * Correctly add syntactic behaviours for WN-LMF 1.1 lexicons ([#156]) ## [v0.9.0] **Release date: 2021-11-17** ### Added * `wn.constants.REVERSE_RELATIONS` * `wn.validate` module ([#143]) * `validate` subcommand ([#143]) * `wn.Lexicon.describe()` ([#144]) * `wn.Wordnet.describe()` ([#144]) * `wn.ConfigurationError` * `wn.ProjectError` ### Fixed * WN-LMF 1.0 Syntactic Behaviours with no `senses` are now assigned to all senses in the lexical entry. If a WN-LMF 1.1 lexicon extension puts Syntactic Behaviour elements on lexical entries (which it shouldn't) it will only be assigned to senses and external senses listed. * `wn.Form` now always hashes like `str`, so things like `set.__contains__` works as expected. 
* `wn.download()` raises an exception on bad responses ([#147]]) * Avoid returning duplicate matches when a lemmatizer is used ([#154]) ### Removed * `wn.lmf.dump()` no longer has the `version` parameter ### Changed * `wn.lmf.load()` - returns a dictionary for the resource instead of a list of lexicons, now including the WN-LMF version, as below: ```python { 'lmf_version': '...', 'lexicons': [...] } ``` - returned lexicons are modeled with Python lists and dicts instead of custom classes ([#80]) * `wn.lmf.scan_lexicons()` only returns info about present lexicons, not element counts ([#113]) * Improper configurations (e.g., invalid data directory, malformed index) now raise a `wn.ConfigurationError` * Attempting to get an unknown project or version now raises `wn.ProjectError` instead of `wn.Error` or `KeyError` * Projects and versions in the index now take an `error` key. Calling `wn.config.get_project_info()` on such an entry will raise `wn.ProjectError`. Such entries may not also specify a url. The entry can still be viewed without triggering the error via `wn.config.index`. ([#146]) * Project versions in the index may specify multiple, space-separated URLs on the url key. If one fails, the next will be attempted when downloading. ([#142]) * `wn.config.get_project_info()` now returns a `resource_urls` key mapped to a list of URLs instead of `resource_url` mapped to a single URL. 
([#142]) * `wn.config.get_cache_path()` now only accepts URL arguments * The `lexicon` parameter in many functions now allows glob patterns like `omw-*:1.4` ([#155]) ### Index * Added `oewn:2021` new ID, previously `ewn` ([#152]) * Added `own`, `own-pt`, and `own-en` ([#97]) * Added `odenet:1.4` * Added `omw:1.4`, including `omw-en`, formerly `pwn:3.0` ([#152]) * Added `omw-en31:1.4`, formerly `pwn:3.1` ([#152]) * Removed `omw:1.3`, `pwn:3.0`, and `pwn:3.1` ([#152]) * Added `kurdnet:1.0` ([#140]) ## [v0.8.3] **Release date: 2021-11-03** ### Fixed * `wn.lmf` now serialized DC and non-DC metadata correctly ([#148]) ## [v0.8.2] **Release date: 2021-11-01** This release only resolves some dependency issues with the previous release. ## [v0.8.1] **Release date: 2021-10-29** Note: the release on PyPI was yanked because a dependency was not specified properly. ### Fixed * `wn.lmf` uses `https://` for the `dc` namespace instead of `http://`, following the DTD ## [v0.8.0] **Release date: 2021-07-07** ### Added * `wn.ic` module ([#40] * `wn.taxonomy` module ([#125]) * `wn.similarity.res` Resnik similarity ([#122]) * `wn.similarity.jcn` Jiang-Conrath similarity ([#123]) * `wn.similarity.lin` Lin similarity ([#124]) * `wn.util.synset_id_formatter` ([#119]) ### Changed * Taxonomy methods on `wn.Synset` are moved to `wn.taxonomy`, but shortcut methods remain for compatibility ([#125]). * Similarity metrics in `wn.similarity` now raise an error when synsets come from different parts of speech. 
## [v0.7.0] **Release date: 2021-06-09** ### Added * Support for approximate word searches; on by default, configurable only by instantiating a `wn.Wordnet` object ([#105]) * `wn.morphy` ([#19]) * `wn.Wordnet.lemmatizer` attribute ([#8]) * `wn.web` ([#116]) * `wn.Sense.relations()` ([#82]) * `wn.Synset.relations()` ([#82]) ### Changed * `wn.lmf.load()` now takes a `progress_handler` parameter ([#46]) * `wn.lmf.scan_lexicons()` no longer returns sets of relation types or lexfiles; `wn.add()` now gets these from loaded lexicons instead * `wn.util.ProgressHandler` - Now has a `refresh_interval` parameter; updates only trigger a refresh after the counter hits the threshold set by the interval - The `update()` method now takes a `force` parameter to trigger a refresh regardless of the refresh interval * `wn.Wordnet` - Initialization now takes a `normalizer` parameter ([#105]) - Initialization now takes a `lemmatizer` parameter ([#8]) - Initialization now takes a `search_all_forms` parameter ([#115]) - `Wordnet.words()`, `Wordnet.senses()` and `Wordnet.synsets()` now use any specified lemmatization or normalization functions to expand queries on word forms ([#105]) ### Fixed * `wn.Synset.ili` for proposed ILIs now works again (#117) ## [v0.6.2] **Release date: 2021-03-22** ### Fixed * Disable `sqlite3` progress reporting after `wn.remove()` ([#108]) ## [v0.6.1] **Release date: 2021-03-05** ### Added * `wn.DatabaseError` as a more specific error type for schema changes ([#106]) ## [v0.6.0] **Release date: 2021-03-04** **Notice:** This release introduces backwards-incompatible changes to the schema that require users upgrading from previous versions to rebuild their database. 
### Added * For WN-LMF 1.0 support ([#65]) - `wn.Sense.frames()` - `wn.Sense.adjposition()` - `wn.Tag` - `wn.Form.tags()` - `wn.Count` - `wn.Sense.counts()` * For ILI modeling ([#23]) - `wn.ILI` class - `wn.Wordnet.ili()` - `wn.Wordnet.ilis()` - `wn.ili()` - `wn.ilis()` - `wn.project.Package.type` property - Index entries of different types; default is `'wordnet'`, `'ili'` is also available - Support for detecting and loading ILI tab-separated-value exports; not directly accessible through the public API at this time - Support for adding ILI resources to the database - A CILI index entry ([#23]) * `wn.lmf` WN-LMF 1.1 support ([#7]) - `` - ``, ``, ``, ``, ``, ``, `` - `subcat` on `` - `members` on `` - `lexfile` on `` - `` - `id` on `
` - New relations * Other WN-LMF 1.1 support - `wn.Lexicon.requires()` - `wn.Lexicon.extends()` ([#99]) - `wn.Lexicon.extensions()` ([#99]) - `wn.Pronunciation` ([#7]) - `wn.Form.pronunciations()` ([#7]) - `wn.Form.id` ([#7]) - `wn.Synset.lexfile()` * `wn.constants.SENSE_SYNSET_RELATIONS` * `wn.WnWarning` (related to [#92]) * `wn.Lexicon.modified()` ([#17]) ### Fixed * Adding a wordnet with sense relations with invalid target IDs now raises an error instead of ignoring the relation. * Detect LMF-vs-CILI projects even when files are uncompressed ([#104]) ### Changed * WN-LMF 1.0 entities now modeled and exported to XML ([#65]): - Syntactic behaviour ([#65]) - Adjpositions ([#65]) - Form tags - Sense counts - Definition source senses - ILI definitions * WN-LMF 1.1 entities now modeled and exported to XML ([#89]): - Lexicon requirements and extensions ([#99]) - Form pronunciations - Lexicographer files via the `lexfile` attribute - Form ids * `wn.Synset.ili` now returns an `ILI` object * `wn.remove()` now takes a `progess_handler` parameter * `wn.util.ProgressBar` uses a simpler formatting string with two new computed variables * `wn.project.is_package_directory()` and `wn.project.is_collection_directory()` now detect packages/collection with ILI resource files ([#23]) * `wn.project.iterpackages()` now includes ILI packages * `wn.Wordnet` now sets the default `expand` value to a lexicon's dependencies if they are specified (related to [#92]) ### Schema * General changes: - Parts of speech are stored as text - Added indexes and `ON DELETE` actions to speed up `wn.remove()` - All extendable tables are now linked to their lexicon ([#91]) - Added rowid to tables with metadata - Preemptively added a `modified` column to `lexicons` table ([#17]) - Preemptively added a `normalized_form` column to `forms` ([#105]) - Relation type tables are combined for synsets and senses ([#75]) * ILI-related changes ([#23]): - ILIs now have an integer rowid and a status - Proposed ILIs also 
have an integer rowid for metadata access - Added a table for ILI statuses * WN-LMF 1.0 changes ([#65]): - SyntacticBehaviour (previously unused) no longer requires an ID and does not use it in the primary key - Added table for adjposition values - Added source-sense to definitions table * WN-LMF 1.1 changes ([#7], [#89]): - Added a table for lexicon dependencies - Added a table for lexicon extensions ([#99]) - Added `logo` column to `lexicons` table - Added a `synset_rank` column to `senses` table - Added a `pronunciations` table - Added column for lexicographer files to the `synsets` table - Added a table for lexicographer file names - Added an `id` column to `forms` table ## [v0.5.1] **Release date: 2021-01-29** ### Fixed * `wn.lmf` specifies `utf-8` when opening files ([#95]) * `wn.lmf.dump()` casts attribute values to strings ## [v0.5.0] **Release date: 2021-01-28** ### Added * `wn.Lexicon.specifier()` * `wn.config.allow_multithreading` ([#86]) * `wn.util` module for public-API utilities * `wn.util.ProgressHandler` ([#87]) * `wn.util.ProgressBar` ([#87]) ### Removed * `wn.Wordnet.lang` ### Changed * `wn.Synset.get_related()` does same-lexicon traversals first, then ILI expansions ([#90]) * `wn.Synset.get_related()` only targets the source synset lexicon in default mode ([#90], [#92]) * `wn.Wordnet` has a "default mode", when no lexicon or language is selected, which searches any lexicon but relation traversals only target the lexicon of the source synset ([#92]) is used for the lexicon id ([#92]) * `wn.Wordnet` has an empty expand set when a lexicon or language is specified and no expand set is specified ([#92]) * `wn.Wordnet` now allows versions in lexicon specifiers when the id is `*` (e.g., `*:1.3+omw`) * `wn.Wordnet` class signature has `lexicon` first, `lang` is keyword-only ([#93]) * `lang` and `lexicon` parameters are keyword-only on `wn.lexicons()`, `wn.word()`, `wn.words()`, `wn.sense()`, `wn.senses()`, `wn.synset()`, `wn.synsets()`, and the 
`translate()` methods of `wn.Word`, `wn.Sense`, and `wn.Synset` ([#93]) ## [v0.4.1] **Release date: 2021-01-19** ### Removed * `wn.config.database_filename` (only `wn.config.data_directory` is configurable now) ### Changed * Schema validation is now done when creating a new connection, instead of on import of `wn` * One connection is shared per database path, rather than storing connections on the modeling classes ([#81]) ### Fixed * More robustly check for LMF validity ([#83]) ## [v0.4.0] **Release date: 2020-12-29** ### Added * `wn.export()` to export lexicon(s) from the database ([#15]) * `wn.lmf.dump()` to dump WN-LMF lexicons to disk ([#15]) * `metadata` method on `wn.Word`, `wn.Sense`, and `wn.Synset` * `lexicalized` method on `wn.Sense` and `wn.Synset` * `wn.Form` class ([#79]) * `--verbose` / `-v` option for the command-line interface ([#71]) ### Changed * `wn.Lexicon.metadata` is now a method * `wn.Word.lemma()` returns a `wn.Form` object ([#79]) * `wn.Word.forms()` returns a list of `wn.Form` objects ([#79]) * `wn.project.iterpackages()` raises `wn.Error` on decompression problems ([#77]) * `wn.lmf.LMFError` now inherits from `wn.Error` * `wn.lmf.scan_lexicons()` raises `LMFError` on XML parsing errors ([#77]) * `wn.download()` reraises caught `wn.Error` with more informative message ([#77]) * `wn.add()` improve error message when lexicons are already added ([#77]) * Basic logging added for `wn.download()` and `wn.add()` ([#71]) * `Synset.get_related()` and `Sense.get_related()` may take a `'*'` parameter to get all relations * `wn.Wordnet` objects keep an open connection to the database ([#81]) ### Fixed * `wn.projects.iterpackages()` tries harder to prevent potential race conditions when reading temporary files ([#76]) * `wn.Lexicon.metadata` now returns a dictionary ([#78]) ## [v0.3.0] **Release date: 2020-12-16** ### Added * `add` parameter to `wn.download()` ([#73]) * `--no-add` option to `wn download` command ([#73]) * `progress_handler` parameter 
to `wn.download()` ([#70]) * `progress_handler` parameter to `wn.add()` ([#70]) ### Fixed * `Synset.shortest_path()` no longer includes starting node ([#63]) * `Synset.closure()`/`Sense.closure()` may take multiple relations ([#74]) * `Synset.hypernym_paths(simulate_root=True)` returns just the fake root node if no paths were found (related to [#64]) * `wn.lexicons()` returns empty list on unknown lang/lexicon ([#59]) ### Changed * Renamed `lgcode` parameter to `lang` throughout ([#66]) * Renamed `Wordnet.lgcode` property to `Wordnet.lang` ([#66]) * Renamed `--lgcode` command-line option to `--lang` ([#66]) * Use better-performing/less-safe database options when adding lexicons ([#69]) ## [v0.2.0] **Release date: 2020-12-02** ### Added * `wn.config.get_cache_path()` returns the path of a cached resource * `wn.projects()` returns the info about known projects ([#60]) * `projects` subcommand to command-line interface ([#60]) * Open German WordNet 1.3 to the index ### Changed * On import, Wn now raises an error if the database has an outdated schema ([#61]) * `wn.config.get_project_info()` now includes a `cache` key * Output of `lexicons` CLI subcommand now tab-delimited ## [v0.1.1] **Release date: 2020-11-26** ### Added * Command-line interface for downloading and listing lexicons ([#47]) ### Fixed * Cast `pathlib.Path` to `str` for `sqlite3.connect()` ([#58]) * Pass `lgcode` to `Wordnet` object in `wn.synset()` ## [v0.1.0] **Release date: 2020-11-25** This is the initial release of the new Wn library. On PyPI it replaces the https://github.com/nltk/wordnet/ code which had been effectively abandoned, but this is an entirely new codebase. 
[v0.14.0]: ../../releases/tag/v0.14.0 [v0.13.0]: ../../releases/tag/v0.13.0 [v0.12.0]: ../../releases/tag/v0.12.0 [v0.11.0]: ../../releases/tag/v0.11.0 [v0.10.1]: ../../releases/tag/v0.10.1 [v0.10.0]: ../../releases/tag/v0.10.0 [v0.9.5]: ../../releases/tag/v0.9.5 [v0.9.4]: ../../releases/tag/v0.9.4 [v0.9.3]: ../../releases/tag/v0.9.3 [v0.9.2]: ../../releases/tag/v0.9.2 [v0.9.1]: ../../releases/tag/v0.9.1 [v0.9.0]: ../../releases/tag/v0.9.0 [v0.8.3]: ../../releases/tag/v0.8.3 [v0.8.2]: ../../releases/tag/v0.8.2 [v0.8.1]: ../../releases/tag/v0.8.1 [v0.8.0]: ../../releases/tag/v0.8.0 [v0.7.0]: ../../releases/tag/v0.7.0 [v0.6.2]: ../../releases/tag/v0.6.2 [v0.6.1]: ../../releases/tag/v0.6.1 [v0.6.0]: ../../releases/tag/v0.6.0 [v0.5.1]: ../../releases/tag/v0.5.1 [v0.5.0]: ../../releases/tag/v0.5.0 [v0.4.1]: ../../releases/tag/v0.4.1 [v0.4.0]: ../../releases/tag/v0.4.0 [v0.3.0]: ../../releases/tag/v0.3.0 [v0.2.0]: ../../releases/tag/v0.2.0 [v0.1.1]: ../../releases/tag/v0.1.1 [v0.1.0]: ../../releases/tag/v0.1.0 [unreleased]: ../../tree/main [#7]: https://github.com/goodmami/wn/issues/7 [#8]: https://github.com/goodmami/wn/issues/8 [#15]: https://github.com/goodmami/wn/issues/15 [#17]: https://github.com/goodmami/wn/issues/17 [#19]: https://github.com/goodmami/wn/issues/19 [#23]: https://github.com/goodmami/wn/issues/23 [#40]: https://github.com/goodmami/wn/issues/40 [#46]: https://github.com/goodmami/wn/issues/46 [#47]: https://github.com/goodmami/wn/issues/47 [#53]: https://github.com/goodmami/wn/issues/53 [#55]: https://github.com/goodmami/wn/issues/55 [#58]: https://github.com/goodmami/wn/issues/58 [#59]: https://github.com/goodmami/wn/issues/59 [#60]: https://github.com/goodmami/wn/issues/60 [#61]: https://github.com/goodmami/wn/issues/61 [#63]: https://github.com/goodmami/wn/issues/63 [#64]: https://github.com/goodmami/wn/issues/64 [#65]: https://github.com/goodmami/wn/issues/65 [#66]: https://github.com/goodmami/wn/issues/66 [#69]: 
https://github.com/goodmami/wn/issues/69 [#70]: https://github.com/goodmami/wn/issues/70 [#71]: https://github.com/goodmami/wn/issues/71 [#73]: https://github.com/goodmami/wn/issues/73 [#74]: https://github.com/goodmami/wn/issues/74 [#75]: https://github.com/goodmami/wn/issues/75 [#76]: https://github.com/goodmami/wn/issues/76 [#77]: https://github.com/goodmami/wn/issues/77 [#78]: https://github.com/goodmami/wn/issues/78 [#79]: https://github.com/goodmami/wn/issues/79 [#80]: https://github.com/goodmami/wn/issues/80 [#81]: https://github.com/goodmami/wn/issues/81 [#82]: https://github.com/goodmami/wn/issues/82 [#83]: https://github.com/goodmami/wn/issues/83 [#86]: https://github.com/goodmami/wn/issues/86 [#87]: https://github.com/goodmami/wn/issues/87 [#89]: https://github.com/goodmami/wn/issues/89 [#90]: https://github.com/goodmami/wn/issues/90 [#91]: https://github.com/goodmami/wn/issues/91 [#92]: https://github.com/goodmami/wn/issues/92 [#93]: https://github.com/goodmami/wn/issues/93 [#95]: https://github.com/goodmami/wn/issues/95 [#97]: https://github.com/goodmami/wn/issues/97 [#98]: https://github.com/goodmami/wn/issues/98 [#99]: https://github.com/goodmami/wn/issues/99 [#103]: https://github.com/goodmami/wn/issues/103 [#104]: https://github.com/goodmami/wn/issues/104 [#105]: https://github.com/goodmami/wn/issues/105 [#106]: https://github.com/goodmami/wn/issues/106 [#108]: https://github.com/goodmami/wn/issues/108 [#113]: https://github.com/goodmami/wn/issues/113 [#115]: https://github.com/goodmami/wn/issues/115 [#116]: https://github.com/goodmami/wn/issues/116 [#117]: https://github.com/goodmami/wn/issues/117 [#119]: https://github.com/goodmami/wn/issues/119 [#122]: https://github.com/goodmami/wn/issues/122 [#123]: https://github.com/goodmami/wn/issues/123 [#124]: https://github.com/goodmami/wn/issues/124 [#125]: https://github.com/goodmami/wn/issues/125 [#140]: https://github.com/goodmami/wn/issues/140 [#142]: https://github.com/goodmami/wn/issues/142 
[#143]: https://github.com/goodmami/wn/issues/143 [#144]: https://github.com/goodmami/wn/issues/144 [#146]: https://github.com/goodmami/wn/issues/146 [#147]: https://github.com/goodmami/wn/issues/147 [#148]: https://github.com/goodmami/wn/issues/148 [#151]: https://github.com/goodmami/wn/issues/151 [#152]: https://github.com/goodmami/wn/issues/152 [#154]: https://github.com/goodmami/wn/issues/154 [#155]: https://github.com/goodmami/wn/issues/155 [#156]: https://github.com/goodmami/wn/issues/156 [#157]: https://github.com/goodmami/wn/issues/157 [#167]: https://github.com/goodmami/wn/issues/167 [#168]: https://github.com/goodmami/wn/issues/168 [#169]: https://github.com/goodmami/wn/issues/169 [#177]: https://github.com/goodmami/wn/issues/177 [#181]: https://github.com/goodmami/wn/issues/181 [#191]: https://github.com/goodmami/wn/issues/191 [#194]: https://github.com/goodmami/wn/issues/194 [#200]: https://github.com/goodmami/wn/issues/200 [#201]: https://github.com/goodmami/wn/issues/201 [#202]: https://github.com/goodmami/wn/issues/202 [#203]: https://github.com/goodmami/wn/issues/203 [#207]: https://github.com/goodmami/wn/issues/207 [#211]: https://github.com/goodmami/wn/issues/211 [#213]: https://github.com/goodmami/wn/issues/213 [#214]: https://github.com/goodmami/wn/issues/214 [#215]: https://github.com/goodmami/wn/issues/215 [#216]: https://github.com/goodmami/wn/issues/216 [#221]: https://github.com/goodmami/wn/issues/221 [#226]: https://github.com/goodmami/wn/issues/226 [#228]: https://github.com/goodmami/wn/issues/228 [#233]: https://github.com/goodmami/wn/issues/233 [#234]: https://github.com/goodmami/wn/issues/234 [#235]: https://github.com/goodmami/wn/issues/235 [#238]: https://github.com/goodmami/wn/issues/238 [#241]: https://github.com/goodmami/wn/issues/241 [#246]: https://github.com/goodmami/wn/issues/246 [#248]: https://github.com/goodmami/wn/issues/248 [#250]: https://github.com/goodmami/wn/issues/250 [#255]: https://github.com/goodmami/wn/issues/255 
[#260]: https://github.com/goodmami/wn/issues/260 [#263]: https://github.com/goodmami/wn/issues/263 [#266]: https://github.com/goodmami/wn/issues/266 [#268]: https://github.com/goodmami/wn/pull/268 [#271]: https://github.com/goodmami/wn/issues/271 [#277]: https://github.com/goodmami/wn/issues/277 [#285]: https://github.com/goodmami/wn/issues/285 [#286]: https://github.com/goodmami/wn/issues/286 [#291]: https://github.com/goodmami/wn/issues/291 [#292]: https://github.com/goodmami/wn/issues/292 [#294]: https://github.com/goodmami/wn/issues/294 [#295]: https://github.com/goodmami/wn/issues/295 [#300]: https://github.com/goodmami/wn/issues/300 [#301]: https://github.com/goodmami/wn/issues/301 [#302]: https://github.com/goodmami/wn/issues/302 [#303]: https://github.com/goodmami/wn/issues/303 wn-1.0.0/CITATION.cff000066400000000000000000000022641513755206300140410ustar00rootroot00000000000000cff-version: 1.2.0 title: Wn message: >- Please cite this software using the metadata from 'preferred-citation'. 
type: software authors: - given-names: Michael Wayne family-names: Goodman email: goodman.m.w@gmail.com orcid: 'https://orcid.org/0000-0002-2896-5141' - given-names: Francis family-names: Bond email: bond@ieee.org orcid: 'https://orcid.org/0000-0003-4973-8068' repository-code: 'https://github.com/goodmami/wn/' preferred-citation: type: conference-paper authors: - given-names: Michael Wayne family-names: Goodman email: goodmami@uw.edu orcid: 'https://orcid.org/0000-0002-2896-5141' affiliation: Nanyang Technological University - given-names: Francis family-names: Bond email: bond@ieee.org orcid: 'https://orcid.org/0000-0003-4973-8068' affiliation: Nanyang Technological University start: 100 # First page number end: 107 # Last page number conference: name: "Proceedings of the 11th Global Wordnet Conference" title: "Intrinsically Interlingual: The Wn Python Library for Wordnets" year: 2021 month: 1 url: 'https://aclanthology.org/2021.gwc-1.12/' publisher: "Global Wordnet Association" wn-1.0.0/CONTRIBUTING.md000066400000000000000000000062071513755206300144010ustar00rootroot00000000000000# Contributing to Wn Thanks for helping to make Wn better! 
**Quick Links:** - [Report a bug or request a features](https://github.com/goodmami/wn/issues/new) - [Ask a question](https://github.com/goodmami/wn/discussions) - [View documentation](https://wn.readthedocs.io/) **Developer Information:** - Versioning scheme: [Semantic Versioning](https://semver.org/) - Branching scheme: [GitHub Flow](https://guides.github.com/introduction/flow/) - Changelog: [keep a changelog](https://keepachangelog.com/en/1.0.0/) - Documentation framework: [Sphinx](https://www.sphinx-doc.org/) - Docstring style: [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) (via [sphinx.ext.napoleon](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html)) - Unit/regression testing: [pytest](https://pytest.org/) - Benchmarking: [pytest-benchmark](https://pytest-benchmark.readthedocs.io/) - Packaging framework: [Hatch](https://hatch.pypa.io/) - Coding style: [PEP-8](https://www.python.org/dev/peps/pep-0008/) (via [Ruff](https://beta.ruff.rs/docs/)) - Type checking: [Mypy](http://mypy-lang.org/) ## Get Help Confused about wordnets in general? See the [Global Wordnet Association Documentation](https://globalwordnet.github.io/gwadoc/) Confused about using Wn or wish to share some tips? [Start a discussion](https://github.com/goodmami/wn/discussions) Encountering a problem with Wn or wish to propose a new features? [Raise an issue](https://github.com/goodmami/wn/issues/new) ## Report a Bug When reporting a bug, please provide enough information for someone to reproduce the problem. This might include the version of Python you're running, the version of Wn you have installed, the wordnet lexicons you have installed, and possibly the platform (Linux, Windows, macOS) you're on. Please give a minimal working example that illustrates the problem. For example: > I'm using Wn 0.9.5 with Python 3.11 on Linux and [description of > problem...]. 
Here's what I have tried: > > ```pycon > >>> import wn > >>> # some code > ... # some result or error > ``` ## Request a Feature If there's a feature that you think would make a good addition to Wn, raise an issue describing what the feature is and what problems it would address. ## Guidelines for Contributing See the "developer information" above for a brief description of guidelines and conventions used in Wn. If you have a fix, please submit a pull request to the `main` branch. In general, every pull request should have an associated issue. Developers should run and test Wn locally from source using [Hatch](https://hatch.pypa.io/). Hatch may be installed system-wide or within a virtual environment: ```bash $ pip install hatch ``` You can then use the `hatch` commands like the following: ```console $ hatch shell # activate a Wn virtual environment $ hatch fmt --check # lint the code and check code style $ hatch run mypy:check # type check with mypy $ hatch test # run unit tests $ hatch test bench # run benchmarks $ hatch build # build a source distribution and wheel $ hatch publish # publish build artifacts to PyPI ``` wn-1.0.0/LICENSE000066400000000000000000000020661513755206300131540ustar00rootroot00000000000000MIT License Copyright (c) 2020 Michael Wayne Goodman Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. wn-1.0.0/README.md000066400000000000000000000346641513755206300134370ustar00rootroot00000000000000

Wn logo
a Python library for wordnets
PyPI link Python Support tests Documentation Status
Available Wordnets | Documentation | FAQ | Migrating from NLTK | Citation

--- Wn is a Python library for exploring information in wordnets. ## Installation Install it from PyPI using **pip**: ```sh pip install wn ``` or **uv**: ``` uv add wn ``` > [!IMPORTANT] > Existing users of Wn may encounter an error about an incompatible database schema. > The remedy is to rebuild the database. There is a new function to help with this: > ```pycon > >>> wn.reset_database(rebuild=True) # re-add any indexed lexicons > ``` > or > ```pycon > >>> wn.reset_database() # initialize without re-adding; start from scratch > ``` ## Getting Started First, download some data: ```sh python -m wn download oewn:2025+ # the Open English WordNet 2025+ ``` Now start exploring: ```python >>> import wn >>> en = wn.Wordnet('oewn:2025+') # Create Wordnet object to query >>> ss = en.synsets('win', pos='v')[0] # Get the first synset for 'win' >>> ss.definition() # Get the synset's definition 'be the winner in a contest or competition; be victorious' ``` ## Features - Multilingual by design; first-class support for wordnets in any language - Interlingual queries via the [Collaborative Interlingual Index](https://github.com/globalwordnet/cili/) - Six [similarity metrics](https://wn.readthedocs.io/en/latest/api/wn.similarity.html) - Functions for [exploring taxonomies](https://wn.readthedocs.io/en/latest/api/wn.taxonomy.html) - Support for [lemmatization] ([Morphy] for English is built-in) and unicode [normalization] - Full support of the [WN-LMF 1.4](https://globalwordnet.github.io/schemas/) format, including word pronunciations and lexicon extensions - SQL-based backend offers very fast startup and improved performance on many kinds of queries [lemmatization]: https://wn.readthedocs.io/en/latest/guides/lemmatization.html#lemmatization [normalization]: https://wn.readthedocs.io/en/latest/guides/lemmatization.html#normalization [Morphy]: https://wn.readthedocs.io/en/latest/api/wn.morphy.html ## Available Wordnets Any WN-LMF-formatted wordnet can be added to Wn's database from 
a local file or remote URL, but Wn also maintains an index (see [wn/index.toml](https://github.com/goodmami/wn/blob/main/wn/index.toml)) of available projects, similar to a package manager for software, to aid in the discovery and downloading of new wordnets. The projects in this index are listed below. ### English Wordnets There are several English wordnets available. In general it is recommended to use the latest [Open English Wordnet], but if you have stricter compatibility needs for, e.g., experiment replicability, you may try the [OMW English Wordnet based on WordNet 3.0] (compatible with the Princeton WordNet 3.0 and with the [NLTK]), or [OpenWordnet-EN] (for use with the Portuguese wordnet [OpenWordnet-PT]). | Name | Specifier | # Synsets | Notes | | -------------------------------------------- | ---------------------- | --------: | ----- | | [Open English WordNet] | `oewn:2025+`
`oewn:2025`
`oewn:2024`
`oewn:2023`
`oewn:2022`
`oewn:2021`
`ewn:2020`
`ewn:2019` | 120564
107519
120630
120135
120068
120039
120053
117791 | ← Recommended
 
 
 
 
 
 
  | | [OMW English Wordnet based on WordNet 1.5] | `omw-en15:2.0` | 91591 | | | [OMW English Wordnet based on WordNet 1.6] | `omw-en16:2.0` | 99642 | | | [OMW English Wordnet based on WordNet 1.7] | `omw-en17:2.0` | 109377 | | | [OMW English Wordnet based on WordNet 1.7.1] | `omw-en171:2.0` | 111223 | | | [OMW English Wordnet based on WordNet 2.0] | `omw-en20:2.0` | 115424 | | | [OMW English Wordnet based on WordNet 2.1] | `omw-en21:2.0` | 117597 | | | [OMW English Wordnet based on WordNet 3.0] | `omw-en:2.0`
`omw-en:1.4` | 117659
117659 | Included with `omw:2.0`
Included with `omw:1.4` | | [OMW English Wordnet based on WordNet 3.1] | `omw-en31:2.0`
`omw-en31:1.4` | 117791
117791 | | | [OpenWordnet-EN] | `own-en:1.0.0` | 117659 | Included with `own:1.0.0` | [Open English WordNet]: https://en-word.net [Open Multilingual Wordnet]: https://github.com/omwn [OMW English Wordnet based on WordNet 1.5]: https://github.com/omwn/omw-data [OMW English Wordnet based on WordNet 1.6]: https://github.com/omwn/omw-data [OMW English Wordnet based on WordNet 1.7]: https://github.com/omwn/omw-data [OMW English Wordnet based on WordNet 1.7.1]: https://github.com/omwn/omw-data [OMW English Wordnet based on WordNet 2.0]: https://github.com/omwn/omw-data [OMW English Wordnet based on WordNet 2.1]: https://github.com/omwn/omw-data [OMW English Wordnet based on WordNet 3.0]: https://github.com/omwn/omw-data [OMW English Wordnet based on WordNet 3.1]: https://github.com/omwn/omw-data [OpenWordnet-EN]: https://github.com/own-pt/openWordnet-PT [OpenWordnet-PT]: https://github.com/own-pt/openWordnet-PT [NLTK]: https://www.nltk.org/ ### Other Wordnets and Collections These are standalone non-English wordnets and collections. The wordnets of each collection are listed further down. | Name | Specifier | # Synsets | Language | | ------------------------------------------ | ----------------------------- | --------------: | ---------------- | | [Open Multilingual Wordnet] | `omw:1.4` | n/a | multiple [[mul]] | | [Open German WordNet] | `odenet:1.4`
`odenet:1.3` | 36268
36159 | German [de] | | [Open Wordnets for Portuguese and English] | `own:1.0.0` | n/a | multiple [[mul]] | | [KurdNet] | `kurdnet:1.0` | 2144 | Kurdish [ckb] | [Open English WordNet]: https://github.com/globalwordnet/english-wordnet [Open Multilingual Wordnet]: https://github.com/omwn [OMW English Wordnet based on WordNet 3.0]: https://github.com/omwn [OMW English Wordnet based on WordNet 3.1]: https://github.com/omwn [Open German WordNet]: https://github.com/hdaSprachtechnologie/odenet [Open Wordnets for Portuguese and English]: https://github.com/own-pt [mul]: https://iso639-3.sil.org/code/mul [KurdNet]: https://sinaahmadi.github.io/resources/kurdnet.html ### Open Multilingual Wordnet (OMW) Collection The *Open Multilingual Wordnet* collection (`omw:1.4`) installs the following lexicons (from [here](https://github.com/omwn/omw-data/releases/tag/v1.4)) which can also be downloaded and installed independently: | Name | Specifier | # Synsets | Language | | ---------------------------------------- | -------------------------------- | -----------------: | -------------------------------- | | Albanet | `omw-sq:2.0`
`omw-sq:1.4` | 4679
4675 | Albanian [sq] | | Arabic WordNet (AWN v2) | `omw-arb:2.0`
`omw-arb:1.4` | 9916
9916 | Arabic [arb] | | BulTreeBank Wordnet (BTB-WN) | `omw-bg:2.0`
`omw-bg:1.4` | 4959
4959 | Bulgarian [bg] | | Chinese Open Wordnet | `omw-cmn:2.0`
`omw-cmn:1.4` | 42300
42312 | Mandarin (Simplified) [cmn-Hans] | | Croatian Wordnet | `omw-hr:2.0`
`omw-hr:1.4` | 23115
23120 | Croatian [hr] | | DanNet | `omw-da:2.0`
`omw-da:1.4` | 4476
4476 | Danish [da] | | FinnWordNet | `omw-fi:2.0`
`omw-fi:1.4` | 116763
116763 | Finnish [fi] | | Greek Wordnet | `omw-el:2.0`
`omw-el:1.4` | 18113
18049 | Greek [el] | | Hebrew Wordnet | `omw-he:2.0`
`omw-he:1.4` | 5448
5448 | Hebrew [he] | | IceWordNet | `omw-is:2.0`
`omw-is:1.4` | 4951
4951 | Icelandic [is] | | Italian Wordnet | `omw-iwn:2.0`
`omw-iwn:1.4` | 15563
15563 | Italian [it] | | Japanese Wordnet | `omw-ja:2.0`
`omw-ja:1.4` | 117659
57184 | Japanese [ja] | | Lithuanian WordNet | `omw-lt:2.0`
`omw-lt:1.4` | 9462
9462 | Lithuanian [lt] | | Multilingual Central Repository | `omw-ca:2.0`
`omw-ca:1.4` | 60765
45826 | Catalan [ca] | | Multilingual Central Repository | `omw-eu:2.0`
`omw-eu:1.4` | 29420
29413 | Basque [eu] | | Multilingual Central Repository | `omw-gl:2.0`
`omw-gl:1.4` | 34776
19312 | Galician [gl] | | Multilingual Central Repository | `omw-es:2.0`
`omw-es:1.4` | 78948
38512 | Spanish [es] | | MultiWordNet | `omw-it:2.0`
`omw-it:1.4` | 35001
35001 | Italian [it] | | Norwegian Wordnet | `omw-nb:2.0`
`omw-nb:1.4` | 4455
4455 | Norwegian (Bokmål) [nb] | | Norwegian Wordnet | `omw-nn:2.0`
`omw-nn:1.4` | 3671
3671 | Norwegian (Nynorsk) [nn] | | OMW English Wordnet based on WordNet 3.0 | `omw-en:2.0`
`omw-en:1.4` | 117659
117659 | English [en] | | Open Dutch WordNet | `omw-nl:2.0`
`omw-nl:1.4` | 30177
30177 | Dutch [nl] | | OpenWN-PT | `omw-pt:2.0`
`omw-pt:1.4` | 43895
43895 | Portuguese [pt] | | plWordNet | `omw-pl:2.0`
`omw-pl:1.4` | 33826
33826 | Polish [pl] | | Romanian Wordnet | `omw-ro:2.0`
`omw-ro:1.4` | 58754
56026 | Romanian [ro] | | Slovak WordNet | `omw-sk:2.0`
`omw-sk:1.4` | 18507
18507 | Slovak [sk] | | sloWNet | `omw-sl:2.0`
`omw-sl:1.4` | 42590
42583 | Slovenian [sl] | | Swedish (SALDO) | `omw-sv:2.0`
`omw-sv:1.4` | 6796
6796 | Swedish [sv] | | Thai Wordnet | `omw-th:2.0`
`omw-th:1.4` | 73350
73350 | Thai [th] | | WOLF (Wordnet Libre du Français) | `omw-fr:2.0`
`omw-fr:1.4` | 59091
59091 | French [fr] | | Wordnet Bahasa | `omw-id:2.0`
`omw-id:1.4` | 46774
38085 | Indonesian [id] | | Wordnet Bahasa | `omw-zsm:2.0`
`omw-zsm:1.4` | 36911
36911 | Malaysian [zsm] | ### Open Wordnet (OWN) Collection The *Open Wordnets for Portuguese and English* collection (`own:1.0.0`) installs the following lexicons (from [here](https://github.com/own-pt/openWordnet-PT/releases/tag/v1.0.0)) which can also be downloaded and installed independently: | Name | Specifier | # Synsets | Language | | -------------- | -------------- | --------: | --------------- | | OpenWordnet-PT | `own-pt:1.0.0` | 52670 | Portuguese [pt] | | OpenWordnet-EN | `own-en:1.0.0` | 117659 | English [en] | ### Collaborative Interlingual Index While not a wordnet, the [Collaborative Interlingual Index] (CILI) represents the interlingual backbone of many wordnets. Wn, including interlingual queries, will function without CILI loaded, but adding it to the database makes available the full list of concepts, their status (active, deprecated, etc.), and their definitions. | Name | Specifier | # Concepts | | ---------------------------------- | ---------- | ---------: | | [Collaborative Interlingual Index] | `cili:1.0` | 117659 | [Collaborative Interlingual Index]: https://github.com/globalwordnet/cili/ ## Changes to the Index ### `ewn` → `oewn` The 2021 version of the *Open English WordNet* (`oewn:2021`) has changed its lexicon ID from `ewn` to `oewn`, so the index is updated accordingly. The previous versions are still available as `ewn:2019` and `ewn:2020`. ### `pwn` → `omw-en`, `omw-en31` The wordnet formerly called the *Princeton WordNet* (`pwn:3.0`, `pwn:3.1`) is now called the *OMW English Wordnet based on WordNet 3.0* (`omw-en`) and the *OMW English Wordnet based on WordNet 3.1* (`omw-en31`). This is more accurate, as it is a OMW-produced derivative of the original WordNet data, and it also avoids license or trademark issues. ### `*wn` → `omw-*` for OMW wordnets All OMW wordnets have changed their ID scheme from `...wn` to `omw-..` and the version no longer includes `+omw` (e.g., `bulwn:1.3+omw` is now `omw-bg:1.4`). 
## Citation Michael Wayne Goodman and Francis Bond. 2021. [Intrinsically Interlingual: The Wn Python Library for Wordnets](https://aclanthology.org/2021.gwc-1.12/) In *Proceedings of the 11th Global Wordnet Conference*, pages 100–107, University of South Africa (UNISA). Global Wordnet Association. wn-1.0.0/bench/000077500000000000000000000000001513755206300132225ustar00rootroot00000000000000wn-1.0.0/bench/README.md000066400000000000000000000022241513755206300145010ustar00rootroot00000000000000# Wn Benchmarking This directory contains code and data for running benchmarks for Wn. The benchmarks are implemented using [pytest-benchmarks](https://github.com/ionelmc/pytest-benchmark/), so they are run using pytest as follows (from the top-level project directory): ```console $ hatch test bench/ # run the benchmarks $ hatch test bench/ --benchmark-autosave # run benchmarks and store results $ hatch test bench/ --benchmark-compare # run benchmarks and compare to stored result $ hatch test -- --help # get help on options (look for those prefixed `--benchmark-`) ``` Notes: * The tests are not exhaustive; when making a change that may affect performance, consider making a new test if one doesn't exist already. It would be helpful to check in the test to Git, but not the benchmark results since those are dependent on the machine. * Benchmark the code before and after the changes. Store the results locally for comparison. * Ensure the testing environment has a steady load (wait for long-running processes to finish, close any active web browser tabs, etc.) prior to and while running the test. * Expect high variance for IO-bound tasks. 
wn-1.0.0/bench/conftest.py000066400000000000000000000105741513755206300154300ustar00rootroot00000000000000from collections.abc import Iterator from itertools import cycle, product from pathlib import Path import pytest import wn from wn import lmf @pytest.fixture def clean_db(): def clean_db(): wn.remove("*") dummy_lex = lmf.Lexicon( id="dummy", version="1", label="placeholder to initialize the db", language="zxx", email="", license="", ) wn.add_lexical_resource( lmf.LexicalResource(lmf_version="1.3", lexicons=[dummy_lex]) ) return clean_db @pytest.fixture(scope="session") def datadir(): return Path(__file__).parent.parent / "tests" / "data" @pytest.fixture def empty_db(clean_db, tmp_path): dir = tmp_path / "wn_data_empty" with pytest.MonkeyPatch.context() as m: m.setattr(wn.config, "data_directory", dir) clean_db() yield @pytest.fixture(scope="session") def mock_lmf(): synsets: list[lmf.Synset] = [ *_make_synsets("n", 20000), *_make_synsets("v", 10000), *_make_synsets("a", 2000), *_make_synsets("r", 1000), ] entries = _make_entries(synsets) lexicon = lmf.Lexicon( id="mock", version="1", label="", language="zxx", email="", license="", entries=entries, synsets=synsets, ) return lmf.LexicalResource(lmf_version="1.3", lexicons=[lexicon]) @pytest.fixture(scope="session") def mock_db_dir(mock_lmf, tmp_path_factory): dir = tmp_path_factory.mktemp("wn_data_empty") with pytest.MonkeyPatch.context() as m: m.setattr(wn.config, "data_directory", dir) wn.add_lexical_resource(mock_lmf, progress_handler=None) wn._db.clear_connections() return Path(dir) @pytest.fixture def mock_db(monkeypatch, mock_db_dir): with monkeypatch.context() as m: m.setattr(wn.config, "data_directory", mock_db_dir) yield wn._db.clear_connections() def _make_synsets(pos: str, n: int) -> list[lmf.Synset]: synsets: list[lmf.Synset] = [ lmf.Synset( id=f"{i}-{pos}", ili="", partOfSpeech=pos, relations=[], meta={}, ) for i in range(1, n + 1) ] # add relations for nouns and verbs if pos in "nv": total = 
len(synsets) tgt_i = 1 # index of next target synset n = cycle([2]) # how many targets to relate for cur_i in range(total): if tgt_i <= cur_i: tgt_i = cur_i + 1 source = synsets[cur_i] for cur_k in range(tgt_i, tgt_i + next(n)): if cur_k >= total: break target = synsets[cur_k] source["relations"].append( lmf.Relation(target=target["id"], relType="hyponym", meta={}) ) target["relations"].append( lmf.Relation(target=source["id"], relType="hypernym", meta={}) ) tgt_i = cur_k + 1 return synsets def _words() -> Iterator[str]: consonants = "kgtdpbfvszrlmnhw" vowels = "aeiou" while True: yield from map("".join, product(consonants, vowels, consonants, vowels)) def _make_entries(synsets: list[lmf.Synset]) -> list[lmf.LexicalEntry]: words = _words() member_count = cycle(range(1, 4)) # 1, 2, or 3 synset members entries: dict[str, lmf.LexicalEntry] = {} prev_synsets: list[lmf.Synset] = [] for synset in synsets: ssid = synset["id"] pos = synset["partOfSpeech"] for _ in range(next(member_count)): word = next(words) senses = [lmf.Sense(id=f"{word}-{ssid}", synset=ssid, meta={})] # add some polysemy if prev_synsets: ssid2 = prev_synsets.pop()["id"] senses.append(lmf.Sense(id=f"{word}-{ssid2}", synset=ssid2, meta={})) eid = f"{word}-{pos}" if eid not in entries: entries[eid] = lmf.LexicalEntry( id=eid, lemma=lmf.Lemma( writtenForm=word, partOfSpeech=pos, ), senses=[], meta={}, ) entries[eid]["senses"].extend(senses) prev_synsets.append(synset) return list(entries.values()) wn-1.0.0/bench/test_bench.py000066400000000000000000000033061513755206300157140ustar00rootroot00000000000000import pytest import wn from wn import lmf @pytest.mark.benchmark(group="lmf.load", warmup=True) def test_load(datadir, benchmark): benchmark(lmf.load, datadir / "mini-lmf-1.0.xml") @pytest.mark.benchmark(group="wn.add_lexical_resource") @pytest.mark.usefixtures("empty_db") def test_add_lexical_resource(mock_lmf, benchmark): # TODO: when pytest-benchmark's teardown option is released, use # that here with 
more rounds benchmark.pedantic( wn.add_lexical_resource, args=(mock_lmf,), # teardown=clean_db, iterations=1, rounds=1, ) @pytest.mark.benchmark(group="wn.add_lexical_resource") @pytest.mark.usefixtures("empty_db") def test_add_lexical_resource_no_progress(mock_lmf, benchmark): # TODO: when pytest-benchmark's teardown option is released, use # that here with more rounds benchmark.pedantic( wn.add_lexical_resource, args=(mock_lmf,), kwargs={"progress_handler": None}, # teardown=clean_db, iterations=1, rounds=1, ) @pytest.mark.benchmark(group="primary queries") @pytest.mark.usefixtures("mock_db") def test_synsets(benchmark): benchmark(wn.synsets) @pytest.mark.benchmark(group="primary queries") @pytest.mark.usefixtures("mock_db") def test_words(benchmark): benchmark(wn.words) @pytest.mark.benchmark(group="secondary queries") @pytest.mark.usefixtures("mock_db") def test_word_senses_no_wordnet(benchmark): word = wn.words()[0] benchmark(word.senses) @pytest.mark.benchmark(group="secondary queries") @pytest.mark.usefixtures("mock_db") def test_word_senses_with_wordnet(benchmark): w = wn.Wordnet("mock:1") word = w.words()[0] benchmark(word.senses) wn-1.0.0/docs/000077500000000000000000000000001513755206300130735ustar00rootroot00000000000000wn-1.0.0/docs/.readthedocs.yaml000066400000000000000000000011301513755206300163150ustar00rootroot00000000000000# .readthedocs.yaml # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details # Required version: 2 # Set the version of Python and other tools you might need build: os: ubuntu-22.04 tools: python: "3.12" # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py # We recommend specifying your dependencies to enable reproducible builds: # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html python: install: - requirements: docs/requirements.txt formats: - pdf - epub 
wn-1.0.0/docs/Makefile000066400000000000000000000011721513755206300145340ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) wn-1.0.0/docs/_static/000077500000000000000000000000001513755206300145215ustar00rootroot00000000000000wn-1.0.0/docs/_static/css/000077500000000000000000000000001513755206300153115ustar00rootroot00000000000000wn-1.0.0/docs/_static/css/svg.css000066400000000000000000000003151513755206300166210ustar00rootroot00000000000000svg { width: 500px; height: 300px; position: relative; left: 20%; -webkit-transform: translateX(-20%); -ms-transform: translateX(-20%); transform: translateX(-20%); } wn-1.0.0/docs/_static/demo.ipynb000066400000000000000000000551121513755206300165140ustar00rootroot00000000000000{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![logo](https://raw.githubusercontent.com/goodmami/wn/main/docs/_static/wn-logo-rotate.svg)\n", "\n", "# Wn Demonstration\n", "\n", "This is a demonstration of the [Wn](https://github.com/goodmami/wn/) library for working with wordnets in Python. 
To run this notebook locally, you will need to install the `wn` and `jupyter` packages, and download some wordnet data:\n", "\n", "* Linux/macOS\n", "\n", " ```console\n", " $ python3 -m pip install wn jupyter\n", " $ python3 -m wn download omw oewn:2021\n", " ```\n", " \n", "* Windows\n", "\n", " ```console\n", " > py -3 -m pip install wn jupyter\n", " > py -3 -m wn download omw oewn:2021\n", " ```\n", "\n", "Now you should be able to import the `wn` package:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import wn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Primary Queries\n", "\n", "A **primary query** of the database is when basic parameters such as word forms, parts of speech, or public identifiers (e.g., synset IDs) are used to retrieve basic wordnet entities. You can perform these searches via module-level functions such as [wn.words()](https://wn.readthedocs.io/en/latest/api/wn.html#wn.words), [wn.senses()](https://wn.readthedocs.io/en/latest/api/wn.html#wn.senses), and [wn.synsets()](https://wn.readthedocs.io/en/latest/api/wn.html#wn.synsets):" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Word('oewn-Malacca-n')]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wn.words(\"Malacca\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Synset('oewn-08985168-n')]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wn.synsets(\"Malacca\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Filtering by Language / Lexicon\n", "\n", "Once you've added multiple wordnets, however, you will often get many results for such queries. 
If that's not clear, then the following will give you some idea(s):" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Word('omw-en-idea-n'),\n", " Word('omw-sk-idea-n'),\n", " Word('omw-pl-idea-n'),\n", " Word('omw-is-ídea-n'),\n", " Word('omw-zsm-idea-n'),\n", " Word('omw-iwn-idea-n'),\n", " Word('omw-it-idea-n'),\n", " Word('omw-gl-idea-n'),\n", " Word('omw-fi-idea-n'),\n", " Word('omw-ca-idea-n'),\n", " Word('omw-eu-idea-n'),\n", " Word('omw-es-idea-n'),\n", " Word('oewn-idea-n')]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wn.words(\"idea\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can filter down the results by language, but that may not be enough if you have multiple wordnets for the same language (e.g., the [OMW English Wordnet based on WordNet 3.0](https://github.com/omwn/omw-data/) and the [Open English WordNet](https://en-word.net/)):" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Word('omw-en-idea-n'), Word('oewn-idea-n')]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wn.words(\"idea\", lang=\"en\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The [wn.lexicons()](https://wn.readthedocs.io/en/latest/api/wn.html#wn.lexicons) function can show which lexicons have been added for a language:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[, ]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wn.lexicons(lang=\"en\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can use the `id:version` string to restrict queries to a particular lexicon:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Word('omw-en-idea-n')]" ] }, 
"execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wn.words(\"idea\", lexicon=\"omw-en:1.4\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "But it can become tedious to enter these specifiers each time. Instead, a [wn.Wordnet](https://wn.readthedocs.io/en/latest/api/wn.html#the-wordnet-class) object can be used to make the language/lexicon filters persistent:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Word('omw-en-idea-n')]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "en = wn.Wordnet(lexicon=\"omw-en:1.4\")\n", "en.words(\"idea\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Filtering by Word Form and Part of Speech\n", "\n", "Even within a single lexicon a word may return multiple results:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Word('omw-en-pencil-n'), Word('omw-en-pencil-v')]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "en.words(\"pencil\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can restrict results by part of speech, as well. 
E.g., to get the verbal sense of *pencil* (e.g., *to pencil in an appointment*), use the `pos` filter:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Word('omw-en-pencil-v')]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "en.words(\"pencil\", pos=\"v\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This works for getting senses and synsets, too:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Sense('omw-en-pencil-03908204-n'),\n", " Sense('omw-en-pencil-14796748-n'),\n", " Sense('omw-en-pencil-13863020-n'),\n", " Sense('omw-en-pencil-03908456-n'),\n", " Sense('omw-en-pencil-01688604-v')]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "en.senses(\"pencil\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Sense('omw-en-pencil-01688604-v')]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "en.senses(\"pencil\", pos=\"v\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Synset('omw-en-01688604-v')]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "en.synsets(\"pencil\", pos=\"v\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The wordform itself is just a filter on the results. 
Leaving it off, you can get all results for a particular part of speech:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "11531" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(en.words(pos=\"v\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Or all results, regardless of the part of speech:" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "156584" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(en.words())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Secondary Queries\n", "\n", "**Secondary queries** are used when you want to get additional information from a retrieved entity, such as the forms of a word or the definition of a synset. They are also used for finding links between entities, such as the senses of a word or the relations of a sense or synset." 
] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'pencil'" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil = en.words(\"pencil\", pos=\"v\")[0]\n", "pencil.lemma()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['pencil', 'pencilled', 'pencilling']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil.forms()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'v'" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil.pos" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Sense('omw-en-pencil-01688604-v')]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil.senses()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Synset('omw-en-01688604-v')" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil.senses()[0].synset()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Synset('omw-en-01688604-v')]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil.synsets() # shorthand for the above" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'write, draw, or trace with a pencil'" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil_ss = pencil.synsets()[0]\n", "pencil_ss.definition()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['he penciled a figure']" ] }, 
"execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil_ss.examples()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Synset('omw-en-01690294-v')]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil_ss.hypernyms()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['draw']" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil_ss.hypernyms()[0].lemmas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Taxonomy Queries\n", "\n", "A common usage of wordnets is exploring the taxonomic structure via hypernym and hyponym relations. These operations thus have some more dedicated functions. For instance, path functions show the synsets from the starting synset to some other synset or the taxonomic root, such as [Synset.hypernym_paths()](https://wn.readthedocs.io/en/latest/api/wn.html#wn.Synset.hypernym_paths):" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Synset('omw-en-01690294-v') ['draw']\n", " Synset('omw-en-01686132-v') ['represent', 'interpret']\n", " Synset('omw-en-01619354-v') ['re-create']\n", " Synset('omw-en-01617192-v') ['make', 'create']\n" ] } ], "source": [ "for path in pencil_ss.hypernym_paths():\n", " for i, ss in enumerate(path):\n", " print(\" \" * i, ss, ss.lemmas())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Paths do not include the starting synset, so the length of the path (i.e., number of edges) is the length of the list of synsets. The length from a synset to the root is called the *depth*. However, as some synsets have multiple paths to the root, there is not always one single depth. 
Instead, the [Synset.min_depth()](https://wn.readthedocs.io/en/latest/api/wn.html#wn.Synset.min_depth) and [Synset.max_depth()](https://wn.readthedocs.io/en/latest/api/wn.html#wn.Synset.max_depth) methods find the lengths of the shortest and longest paths." ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dog = en.synsets(\"dog\", pos=\"n\")[0]\n", "len(dog.hypernym_paths()) # two paths" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(8, 13)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dog.min_depth(), dog.max_depth()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It is also possible to find paths between two synsets by their lowest common hypernym (also called *least common subsumer*). Here I compare the verbs *pencil* and *pen*:" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Synset('omw-en-01697816-v') ['create verbally']\n", " Synset('omw-en-01617192-v') ['make', 'create']\n" ] } ], "source": [ "pen_ss = en.synsets(\"pen\", pos=\"v\")[0]\n", "for path in pen_ss.hypernym_paths():\n", " for i, ss in enumerate(path):\n", " print(\" \" * i, ss, ss.lemmas())" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Synset('omw-en-01617192-v')]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil_ss.lowest_common_hypernyms(pen_ss)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Synset('omw-en-01690294-v') ['draw']\n", "Synset('omw-en-01686132-v') ['represent', 'interpret']\n", "Synset('omw-en-01619354-v') 
['re-create']\n", "Synset('omw-en-01617192-v') ['make', 'create']\n", "Synset('omw-en-01697816-v') ['create verbally']\n", "Synset('omw-en-01698271-v') ['write', 'compose', 'pen', 'indite']\n" ] } ], "source": [ "for ss in pencil_ss.shortest_path(pen_ss):\n", " print(ss, ss.lemmas())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Interlingual Queries\n", "\n", "In Wn, each wordnet (lexicon) added to the database is given its own, independent structure. All queries that traverse across wordnets make use of the Interlingual index (ILI) on synsets." ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'a thin cylindrical pointed writing implement; a rod of marking substance encased in wood'" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil_ss = en.synsets(\"pencil\", pos=\"n\")[0] # for this we'll use the nominal sense\n", "pencil_ss.definition()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To get the corresponding words, senses, or synsets in some other lexicon, use the [Word.translate()](https://wn.readthedocs.io/en/latest/api/wn.html#wn.Word.translate), [Sense.translate()](https://wn.readthedocs.io/en/latest/api/wn.html#wn.Sense.translate), and [Synset.translate()](https://wn.readthedocs.io/en/latest/api/wn.html#wn.Synset.translate) functions. Of these, the function on the sense is the most natural, as it translates a specific meaning of a specific word, although all translations go through the synsets. As a word may have many senses, translating a word returns a mapping of each sense to its list of translations." 
] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['lapis', 'matita']" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil_ss.translate(lang=\"it\")[0].lemmas()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['ペンシル', '木筆', '鉛筆']" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pencil_ss.translate(lexicon=\"omw-ja\")[0].lemmas()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{Sense('omw-en-pencil-03908204-n'): [Word('omw-ja-ペンシル-n'),\n", " Word('omw-ja-木筆-n'),\n", " Word('omw-ja-鉛筆-n')],\n", " Sense('omw-en-pencil-14796748-n'): [Word('omw-ja-鉛筆-n')],\n", " Sense('omw-en-pencil-13863020-n'): [],\n", " Sense('omw-en-pencil-03908456-n'): [Word('omw-ja-ペンシル-n')]}" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "en.words(\"pencil\", pos=\"n\")[0].translate(lexicon=\"omw-ja\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Interlingual synsets are also used to traversing relations from another wordnet. For instance, many of the lexicons in the [Open Multilingual Wordnet](https://github.com/omwn/omw-data) were created using the *expand* method where only words were translated on top of Princeton WordNet synsets. All relations (hypernyms, hyponyms, etc.) then depend on those from WordNet. In Wn, a [Wordnet](https://wn.readthedocs.io/en/latest/api/wn.html#the-wordnet-class) object may be instantiated with an `expand` parameter which selects lexicons containing such relations. By default, all lexicons are used (i.e., `expand='*'`), but you can tell Wn to not use any expand lexicons (`expand=''`) or to use a specific lexicon (`expand='omw-en:1.4'`). 
By being specific, you can better control the behaviour of your program, e.g., for experimental reproducibility." ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Synset('omw-ja-14796575-n')]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# by default, any other installed lexicon may be used\n", "wn.Wordnet(lexicon=\"omw-ja\").synsets(\"鉛筆\")[0].hypernyms()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# disable interlingual query expansion\n", "wn.Wordnet(lexicon=\"omw-ja\", expand=\"\").synsets(\"鉛筆\")[0].hypernyms()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Synset('omw-ja-14796575-n')]" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# specify the expand set\n", "wn.Wordnet(lexicon=\"omw-ja\", expand=\"omw-en:1.4\").synsets(\"鉛筆\")[0].hypernyms()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5" } }, "nbformat": 4, "nbformat_minor": 4 } wn-1.0.0/docs/_static/wn-logo-rotate.svg000066400000000000000000000030421513755206300201170ustar00rootroot00000000000000 wn-1.0.0/docs/_static/wn-logo.svg000066400000000000000000000015471513755206300166330ustar00rootroot00000000000000 
wn-1.0.0/docs/api/000077500000000000000000000000001513755206300136445ustar00rootroot00000000000000wn-1.0.0/docs/api/wn.compat.rst000066400000000000000000000010251513755206300163020ustar00rootroot00000000000000wn.compat ========= Compatibility modules for Wn. This subpackage is a namespace for compatibility modules when working with particular lexicons. Wn is designed to be agnostic to the language or lexicon and not favor one over the other (with the exception of :mod:`wn.morphy`, which is English-specific). However, there are some kinds of functionality that would be useful to include in Wn, even if they don't generalize to all lexicons. Included modules ---------------- .. toctree:: :maxdepth: 1 wn.compat.sensekey.rst wn-1.0.0/docs/api/wn.compat.sensekey.rst000066400000000000000000000003011513755206300201230ustar00rootroot00000000000000wn.compat.sensekey ================== .. automodule:: wn.compat.sensekey .. autofunction:: escape .. autofunction:: unescape .. autofunction:: sense_key_getter .. autofunction:: sense_getter wn-1.0.0/docs/api/wn.constants.rst000066400000000000000000000216261513755206300170440ustar00rootroot00000000000000wn.constants ============ .. automodule:: wn.constants Synset Relations ---------------- .. 
data:: SYNSET_RELATIONS - ``agent`` - ``also`` - ``attribute`` - ``be_in_state`` - ``causes`` - ``classified_by`` - ``classifies`` - ``co_agent_instrument`` - ``co_agent_patient`` - ``co_agent_result`` - ``co_instrument_agent`` - ``co_instrument_patient`` - ``co_instrument_result`` - ``co_patient_agent`` - ``co_patient_instrument`` - ``co_result_agent`` - ``co_result_instrument`` - ``co_role`` - ``direction`` - ``domain_region`` - ``domain_topic`` - ``exemplifies`` - ``entails`` - ``eq_synonym`` - ``has_domain_region`` - ``has_domain_topic`` - ``is_exemplified_by`` - ``holo_location`` - ``holo_member`` - ``holo_part`` - ``holo_portion`` - ``holo_substance`` - ``holonym`` - ``hypernym`` - ``hyponym`` - ``in_manner`` - ``instance_hypernym`` - ``instance_hyponym`` - ``instrument`` - ``involved`` - ``involved_agent`` - ``involved_direction`` - ``involved_instrument`` - ``involved_location`` - ``involved_patient`` - ``involved_result`` - ``involved_source_direction`` - ``involved_target_direction`` - ``is_caused_by`` - ``is_entailed_by`` - ``location`` - ``manner_of`` - ``mero_location`` - ``mero_member`` - ``mero_part`` - ``mero_portion`` - ``mero_substance`` - ``meronym`` - ``similar`` - ``other`` - ``patient`` - ``restricted_by`` - ``restricts`` - ``result`` - ``role`` - ``source_direction`` - ``state_of`` - ``target_direction`` - ``subevent`` - ``is_subevent_of`` - ``antonym`` - ``feminine`` - ``has_feminine`` - ``masculine`` - ``has_masculine`` - ``young`` - ``has_young`` - ``diminutive`` - ``has_diminutive`` - ``augmentative`` - ``has_augmentative`` - ``anto_gradable`` - ``anto_simple`` - ``anto_converse`` - ``ir_synonym`` Sense Relations --------------- .. 
data:: SENSE_RELATIONS - ``antonym`` - ``also`` - ``participle`` - ``pertainym`` - ``derivation`` - ``domain_topic`` - ``has_domain_topic`` - ``domain_region`` - ``has_domain_region`` - ``exemplifies`` - ``is_exemplified_by`` - ``similar`` - ``other`` - ``feminine`` - ``has_feminine`` - ``masculine`` - ``has_masculine`` - ``young`` - ``has_young`` - ``diminutive`` - ``has_diminutive`` - ``augmentative`` - ``has_augmentative`` - ``anto_gradable`` - ``anto_simple`` - ``anto_converse`` - ``simple_aspect_ip`` - ``secondary_aspect_ip`` - ``simple_aspect_pi`` - ``secondary_aspect_pi`` .. data:: SENSE_SYNSET_RELATIONS - ``domain_topic`` - ``domain_region`` - ``exemplifies`` - ``other`` .. data:: REVERSE_RELATIONS .. code-block:: python { 'hypernym': 'hyponym', 'hyponym': 'hypernym', 'instance_hypernym': 'instance_hyponym', 'instance_hyponym': 'instance_hypernym', 'antonym': 'antonym', 'eq_synonym': 'eq_synonym', 'similar': 'similar', 'meronym': 'holonym', 'holonym': 'meronym', 'mero_location': 'holo_location', 'holo_location': 'mero_location', 'mero_member': 'holo_member', 'holo_member': 'mero_member', 'mero_part': 'holo_part', 'holo_part': 'mero_part', 'mero_portion': 'holo_portion', 'holo_portion': 'mero_portion', 'mero_substance': 'holo_substance', 'holo_substance': 'mero_substance', 'also': 'also', 'state_of': 'be_in_state', 'be_in_state': 'state_of', 'causes': 'is_caused_by', 'is_caused_by': 'causes', 'subevent': 'is_subevent_of', 'is_subevent_of': 'subevent', 'manner_of': 'in_manner', 'in_manner': 'manner_of', 'attribute': 'attribute', 'restricts': 'restricted_by', 'restricted_by': 'restricts', 'classifies': 'classified_by', 'classified_by': 'classifies', 'entails': 'is_entailed_by', 'is_entailed_by': 'entails', 'domain_topic': 'has_domain_topic', 'has_domain_topic': 'domain_topic', 'domain_region': 'has_domain_region', 'has_domain_region': 'domain_region', 'exemplifies': 'is_exemplified_by', 'is_exemplified_by': 'exemplifies', 'role': 'involved', 'involved': 
'role', 'agent': 'involved_agent', 'involved_agent': 'agent', 'patient': 'involved_patient', 'involved_patient': 'patient', 'result': 'involved_result', 'involved_result': 'result', 'instrument': 'involved_instrument', 'involved_instrument': 'instrument', 'location': 'involved_location', 'involved_location': 'location', 'direction': 'involved_direction', 'involved_direction': 'direction', 'target_direction': 'involved_target_direction', 'involved_target_direction': 'target_direction', 'source_direction': 'involved_source_direction', 'involved_source_direction': 'source_direction', 'co_role': 'co_role', 'co_agent_patient': 'co_patient_agent', 'co_patient_agent': 'co_agent_patient', 'co_agent_instrument': 'co_instrument_agent', 'co_instrument_agent': 'co_agent_instrument', 'co_agent_result': 'co_result_agent', 'co_result_agent': 'co_agent_result', 'co_patient_instrument': 'co_instrument_patient', 'co_instrument_patient': 'co_patient_instrument', 'co_result_instrument': 'co_instrument_result', 'co_instrument_result': 'co_result_instrument', 'pertainym': 'pertainym', 'derivation': 'derivation', 'simple_aspect_ip': 'simple_aspect_pi', 'simple_aspect_pi': 'simple_aspect_ip', 'secondary_aspect_ip': 'secondary_aspect_pi', 'secondary_aspect_pi': 'secondary_aspect_ip', 'feminine': 'has_feminine', 'has_feminine': 'feminine', 'masculine': 'has_masculine', 'has_masculine': 'masculine', 'young': 'has_young', 'has_young': 'young', 'diminutive': 'has_diminutive', 'has_diminutive': 'diminutive', 'augmentative': 'has_augmentative', 'has_augmentative': 'augmentative', 'anto_gradable': 'anto_gradable', 'anto_simple': 'anto_simple', 'anto_converse': 'anto_converse', 'ir_synonym': 'ir_synonym', } .. _parts-of-speech: Parts of Speech --------------- .. data:: PARTS_OF_SPEECH - ``n`` -- Noun - ``v`` -- Verb - ``a`` -- Adjective - ``r`` -- Adverb - ``s`` -- Adjective Satellite - ``t`` -- Phrase - ``c`` -- Conjunction - ``p`` -- Adposition - ``x`` -- Other - ``u`` -- Unknown .. 
autodata:: NOUN .. autodata:: VERB .. autodata:: ADJECTIVE .. data:: ADJ Alias of :py:data:`ADJECTIVE` .. autodata:: ADJECTIVE_SATELLITE .. data:: ADJ_SAT Alias of :py:data:`ADJECTIVE_SATELLITE` .. autodata:: PHRASE .. autodata:: CONJUNCTION .. data:: CONJ Alias of :py:data:`CONJUNCTION` .. autodata:: ADPOSITION .. autodata:: ADP Alias of :py:data:`ADPOSITION` .. autodata:: OTHER .. autodata:: UNKNOWN Adjective Positions ------------------- .. data:: ADJPOSITIONS - ``a`` -- Attributive - ``ip`` -- Immediate Postnominal - ``p`` -- Predicative Lexicographer Files ------------------- .. data:: LEXICOGRAPHER_FILES .. code-block:: python { 'adj.all': 0, 'adj.pert': 1, 'adv.all': 2, 'noun.Tops': 3, 'noun.act': 4, 'noun.animal': 5, 'noun.artifact': 6, 'noun.attribute': 7, 'noun.body': 8, 'noun.cognition': 9, 'noun.communication': 10, 'noun.event': 11, 'noun.feeling': 12, 'noun.food': 13, 'noun.group': 14, 'noun.location': 15, 'noun.motive': 16, 'noun.object': 17, 'noun.person': 18, 'noun.phenomenon': 19, 'noun.plant': 20, 'noun.possession': 21, 'noun.process': 22, 'noun.quantity': 23, 'noun.relation': 24, 'noun.shape': 25, 'noun.state': 26, 'noun.substance': 27, 'noun.time': 28, 'verb.body': 29, 'verb.change': 30, 'verb.cognition': 31, 'verb.communication': 32, 'verb.competition': 33, 'verb.consumption': 34, 'verb.contact': 35, 'verb.creation': 36, 'verb.emotion': 37, 'verb.motion': 38, 'verb.perception': 39, 'verb.possession': 40, 'verb.social': 41, 'verb.stative': 42, 'verb.weather': 43, 'adj.ppl': 44, } wn-1.0.0/docs/api/wn.ic.rst000066400000000000000000000156001513755206300154160ustar00rootroot00000000000000 wn.ic ===== .. automodule:: wn.ic The mathematical formulae for information content are defined in `Formal Description`_, and the corresponding Python API function are described in `Calculating Information Content`_. 
These functions require information content weights obtained either by `computing them from a corpus `_, or by `loading pre-computed weights from a file `_. .. note:: The term *information content* can be ambiguous. It often, and most accurately, refers to the result of the :func:`information_content` function (:math:`\text{IC}(c)` in the mathematical notation), but is also sometimes used to refer to the corpus frequencies/weights (:math:`\text{freq}(c)` in the mathematical notation) returned by :func:`load` or :func:`compute`, as these weights are the basis of the value computed by :func:`information_content`. The Wn documentation tries to consistently refer to former as the *information content value*, or just *information content*, and the latter as *information content weights*, or *weights*. Formal Description ------------------ The Information Content (IC) of a concept (synset) is a measure of its specificity computed from the wordnet's taxonomy structure and corpus frequencies. It is defined by Resnik 1995 ([RES95]_), following information theory, as the negative log-probability of a concept: .. math:: \text{IC}(c) = -\log{p(c)} A concept's probability is the empirical probability over a corpus: .. math:: p(c) = \frac{\text{freq}(c)}{N} Here, :math:`N` is the total count of words of the same category as concept :math:`c` ([RES95]_ only considered nouns) where each word has some representation in the wordnet, and :math:`\text{freq}` is defined as the sum of corpus counts of words in :math:`\text{words}(c)`, which is the set of words subsumed by concept :math:`c`: .. math:: \text{freq}(c) = \sum_{w \in \text{words}(c)}{\text{count}(w)} It is common for :math:`\text{freq}` to not contain actual frequencies but instead weights distributed evenly among the synsets for a word. These weights are calculated as the word frequency divided by the number of synsets for the word: .. 
math:: \text{freq}_{\text{distributed}}(c) = \sum_{w \in \text{words}(c)}{\frac{\text{count}(w)}{|\text{synsets}(w)|}} .. [RES95] Resnik, Philip. "Using information content to evaluate semantic similarity." In Proceedings of the 14th International Joint Conference on Artificial Intelligence (IJCAI-95), Montreal, Canada, pp. 448-453. 1995. Example ------- In the Princeton WordNet 3.0 (hereafter *WordNet*, but note that the equivalent lexicon in Wn is the *OMW English Wordnet based on WordNet 3.0* with specifier ``omw-en:1.4``), the frequency of a concept like **stone fruit** is not just the number of occurrences of *stone fruit*, but also includes the counts of the words for its hyponyms (*almond*, *olive*, etc.) and other taxonomic descendants (*Jordan almond*, *green olive*, etc.). The word *almond* has two synsets: one for the fruit or nut, another for the plant. Thus, if the word *almond* is encountered :math:`n` times in a corpus, then the weight (either the frequency :math:`n` or distributed weight :math:`\frac{n}{2}`) is added to the total weights for both synsets and to those of their ancestors, but not for descendant synsets, such as for **Jordan almond**. The fruit/nut synset of almond has two hypernym paths which converge on **fruit**: 1. **almond** ⊃ **stone fruit** ⊃ **fruit** 2. **almond** ⊃ **nut** ⊃ **seed** ⊃ **fruit** The weight is added to each ancestor (**stone fruit**, **nut**, **seed**, **fruit**, ...) once. That is, the weight is not added to the convergent ancestor for **fruit** twice, but only once. Calculating Information Content ------------------------------- .. autofunction:: information_content .. autofunction:: synset_probability Computing Corpus Weights ------------------------ If pre-computed weights are not available for a wordnet or for some domain, they can be computed given a corpus and a wordnet. The corpus is an iterable of words. 
For large corpora it may help to use a generator for this iterable, but the entire vocabulary (i.e., unique words and counts) will be held at once in memory. Multi-word expressions are also possible if they exist in the wordnet. For instance, WordNet has *stone fruit*, with a single space delimiting the words, as an entry. The :class:`wn.Wordnet` object must be instantiated with a single lexicon, although it may have expand-lexicons for relation traversal. For best results, the wordnet should use a lemmatizer to help it deal with inflected wordforms from running text. .. autofunction:: compute Reading Pre-computed Information Content Files ---------------------------------------------- The :func:`load` function reads pre-computed information content weights files as used by the `WordNet::Similarity `_ Perl module or the `NLTK `_ Python package. These files are computed for a specific version of a wordnet using the synset offsets from the `WNDB `_ format, which Wn does not use. These offsets therefore must be converted into an identifier that matches those used by the wordnet. By default, :func:`load` uses the lexicon identifier from its *wordnet* argument with synset offsets (padded with 0s to make 8 digits) and parts-of-speech from the weights file to format an identifier, such as ``omw-en-00001174-n``. For wordnets that use a different identifier scheme, the *get_synset_id* parameter of :func:`load` can be given a callable created with :func:`wn.util.synset_id_formatter`. It can also be given another callable with the same signature as shown below: .. code-block:: python get_synset_id(*, offset: int, pos: str) -> str When loading pre-computed information content files, it is recommended to use the ones with smoothing (i.e., ``*-add1.dat`` or ``*-resnik-add1.dat``) to avoid math domain errors when computing the information content value. .. warning:: The weights files are only valid for the version of wordnet for which they were created. 
Files created for WordNet 3.0 do not work for WordNet 3.1 because the offsets used in its identifiers are different, although the *get_synset_id* parameter of :func:`load` could be given a function that performs a suitable mapping. Some `Open Multilingual Wordnet `_ wordnets use the WordNet 3.0 offsets in their identifiers and can therefore technically use the weights, but this usage is discouraged because the distributional properties of text in another language and the structure of the other wordnet will not be compatible with that of the English WordNet. For these cases, it is recommended to compute new weights using :func:`compute`. .. autofunction:: load wn-1.0.0/docs/api/wn.ili.rst000066400000000000000000000047511513755206300156050ustar00rootroot00000000000000wn.ili ====== .. automodule:: wn.ili .. note:: See :doc:`../guides/interlingual` for background and usage information about ILIs. Functions for Getting ILI Objects --------------------------------- The following functions are for getting individual :class:`ILI` and :class:`ProposedILI` objects from ILI identifiers or synsets, respectively, or to list all such known objects. .. autofunction:: get .. autofunction:: get_all .. autofunction:: get_proposed .. autofunction:: get_all_proposed ILI Status ---------- The status of an ILI object (:attr:`ILI.status` or :attr:`ProposedILI.status`) indicates what is known about its validity. Explicit information about ILIs can be added to Wn with :func:`wn.add` (e.g., :python:`wn.add("cili")`), but without it Wn can only make a guess. If a lexicon has synsets referencing some ILI identifier and no ILI file has been loaded, that ILI would have a status of :attr:`ILIStatus.PRESUPPOSED`. If an ILI file has been loaded that lists the identifier, it would have a status of :attr:`ILIStatus.ACTIVE`, whether or not a lexicon has been added that uses the ILI. Both of these cases use :class:`ILI` objects. A synset in the WN-LMF format may also propose a new ILI. 
It won't have an identifier, but it should have a definition. These have the status of :attr:`ILIStatus.PROPOSED`. The :class:`ProposedILI` is used for these objects, and that is the only status they have. The :attr:`ILIStatus.UNKNOWN` status is just a default (e.g., when manually creating an :class:`ILI` object) and won't be encountered in normal scenarios. .. autoclass:: ILIStatus .. autoattribute:: UNKNOWN .. autoattribute:: ACTIVE .. autoattribute:: PRESUPPOSED .. autoattribute:: PROPOSED ILI Classes ----------- .. autoclass:: ILI .. autoattribute:: id The ILI identifier. .. autoattribute:: status The status of the ILI. .. automethod:: definition .. autoclass:: ProposedILI .. autoproperty:: id .. autoproperty:: status .. automethod:: definition .. automethod:: synset .. automethod:: lexicon ILI Definitions --------------- Most likely someone inspecting the definition of an :class:`ILI` or :class:`ProposedILI` only cares about the definition text, but for completeness' sake the :class:`ILIDefinition` object models the text along with any metadata that may have appeared in the WN-LMF lexicon file. ILI files do not currently model metadata. .. autoclass:: ILIDefinition .. autoattribute:: text .. automethod:: metadata wn-1.0.0/docs/api/wn.lmf.rst000066400000000000000000000001711513755206300155760ustar00rootroot00000000000000 wn.lmf ====== .. automodule:: wn.lmf .. autofunction:: load .. autofunction:: scan_lexicons .. autofunction:: is_lmf wn-1.0.0/docs/api/wn.morphy.rst000066400000000000000000000062121513755206300163400ustar00rootroot00000000000000 wn.morphy ========= .. automodule:: wn.morphy .. seealso:: The Princeton WordNet `documentation `_ describes the original implementation of Morphy. The :doc:`../guides/lemmatization` guide describes how Wn handles lemmatization in general. Initialized and Uninitialized Morphy ------------------------------------ There are two ways of using Morphy in Wn: initialized and uninitialized. 
Unintialized Morphy is a simple callable that returns lemma *candidates* for some given wordform. That is, the results might not be valid lemmas, but this is not a problem in practice because subsequent queries against the database will filter out the invalid ones. This callable is obtained by creating a :class:`Morphy` object with no arguments: >>> from wn import morphy >>> m = morphy.Morphy() As an uninitialized Morphy cannot predict which lemmas in the result are valid, it always returns the original form and any transformations it can find for each part of speech: >>> m('lemmata', pos='n') # exceptional form {'n': {'lemmata'}} >>> m('lemmas', pos='n') # regular morphology with part-of-speech {'n': {'lemma', 'lemmas'}} >>> m('lemmas') # regular morphology for any part-of-speech {None: {'lemmas'}, 'n': {'lemma'}, 'v': {'lemma'}} >>> m('wolves') # invalid forms may be returned {None: {'wolves'}, 'n': {'wolf', 'wolve'}, 'v': {'wolve', 'wolv'}} This lemmatizer can also be used with a :class:`wn.Wordnet` object to expand queries: >>> import wn >>> ewn = wn.Wordnet('ewn:2020') >>> ewn.words('lemmas') [] >>> ewn = wn.Wordnet('ewn:2020', lemmatizer=morphy.Morphy()) >>> ewn.words('lemmas') [Word('ewn-lemma-n')] An initialized Morphy is created with a :class:`wn.Wordnet` object as its argument. It then uses the wordnet to build lists of valid lemmas and exceptional forms (this takes a few seconds). 
Once this is done, it will only return lemmas it knows about: >>> ewn = wn.Wordnet('ewn:2020') >>> m = morphy.Morphy(ewn) >>> m('lemmata', pos='n') # exceptional form {'n': {'lemma'}} >>> m('lemmas', pos='n') # regular morphology with part-of-speech {'n': {'lemma'}} >>> m('lemmas') # regular morphology for any part-of-speech {'n': {'lemma'}} >>> m('wolves') # invalid forms are pre-filtered {'n': {'wolf'}} In order to use an initialized Morphy lemmatizer with a :class:`wn.Wordnet` object, it must be assigned to the object after creation: >>> ewn = wn.Wordnet('ewn:2020') # default: lemmatizer=None >>> ewn.words('lemmas') [] >>> ewn.lemmatizer = morphy.Morphy(ewn) >>> ewn.words('lemmas') [Word('ewn-lemma-n')] There is little to no difference in the results obtained from a :class:`wn.Wordnet` object using an initialized or uninitialized :class:`Morphy` object, but there may be slightly different performance profiles for future queries. Default Morphy Lemmatizer ------------------------- As a convenience, an uninitialized Morphy lemmatizer is provided in this module via the :data:`morphy` member. .. data:: morphy A :class:`Morphy` object created without a :class:`wn.Wordnet` object. The Morphy Class ---------------- .. autoclass:: Morphy wn-1.0.0/docs/api/wn.project.rst000066400000000000000000000016671513755206300165010ustar00rootroot00000000000000wn.project ========== .. automodule:: wn.project .. autofunction:: get_project .. autofunction:: iterpackages .. autofunction:: is_package_directory .. autofunction:: is_collection_directory Project Classes --------------- Projects can be simple resource files, :class:`Package` directories, or :class:`Collection` directories. For API consistency, resource files are modeled as a virtual package (:class:`ResourceOnlyPackage`). .. class:: Project The base class for packages and collections. This class is not used directly, but all subclasses will implement the methods listed here. .. autoproperty:: path .. automethod:: readme .. 
automethod:: license .. automethod:: citation .. autoclass:: Package :show-inheritance: .. autoproperty:: type .. automethod:: resource_file .. autoclass:: ResourceOnlyPackage :show-inheritance: .. autoclass:: Collection :show-inheritance: .. automethod:: packages wn-1.0.0/docs/api/wn.rst000066400000000000000000000304771513755206300150350ustar00rootroot00000000000000 wn === .. automodule:: wn Project Management Functions ---------------------------- .. autofunction:: download .. autofunction:: add .. autofunction:: add_lexical_resource .. autofunction:: remove .. autofunction:: export .. autofunction:: projects .. autofunction:: reset_database Wordnet Query Functions ----------------------- While it is best to first instantiate a :class:`Wordnet` object with a specific lexicon and use that for querying (see :ref:`default-mode`), the following functions are also available for quick and simple queries. .. autofunction:: word .. autofunction:: words .. autofunction:: lemmas .. autofunction:: sense .. autofunction:: senses .. autofunction:: synset .. autofunction:: synsets .. autofunction:: lexicons The Wordnet Class ----------------- .. autoclass:: Wordnet .. automethod:: word .. automethod:: words .. automethod:: lemmas .. automethod:: sense .. automethod:: senses .. automethod:: synset .. automethod:: synsets .. automethod:: lexicons .. automethod:: expanded_lexicons .. automethod:: describe Words, Senses, and Synsets -------------------------- The results of primary queries against a lexicon are :class:`Word`, :class:`Sense`, or :class:`Synset` objects. See :doc:`../guides/wordnet` for more information about the concepts these object represent. Word Objects '''''''''''' .. class:: Word :class:`Word` (or "lexical entry") objects encode information about word forms independent from their meaning. .. autoattribute:: id The identifier used within a lexicon. .. autoattribute:: pos The part of speech of the Word. .. automethod:: lemma .. automethod:: forms .. 
automethod:: senses .. automethod:: synsets .. automethod:: lexicon .. automethod:: metadata .. automethod:: confidence .. automethod:: derived_words .. automethod:: translate Sense Objects ''''''''''''' .. class:: Sense :class:`Sense` objects represent a pairing of a :class:`Word` and a :class:`Synset`. .. autoattribute:: id The identifier used within a lexicon. .. automethod:: word .. automethod:: synset .. automethod:: examples .. automethod:: lexicalized .. automethod:: adjposition .. automethod:: frames .. automethod:: counts .. automethod:: lexicon .. automethod:: metadata .. automethod:: confidence .. automethod:: relations .. automethod:: synset_relations .. automethod:: get_related .. automethod:: get_related_synsets .. automethod:: closure .. automethod:: relation_paths .. automethod:: translate Synset Objects '''''''''''''' .. class:: Synset :class:`Synset` objects represent a set of words that share a meaning. .. autoattribute:: id The identifier used within a lexicon. .. autoattribute:: pos The part of speech of the Synset. .. autoproperty:: ili The interlingual index of the Synset. .. automethod:: definition .. automethod:: definitions .. automethod:: examples .. automethod:: senses .. automethod:: lexicalized .. automethod:: lexfile .. automethod:: lexicon .. automethod:: metadata .. automethod:: confidence .. automethod:: words .. automethod:: lemmas .. automethod:: hypernyms .. automethod:: hyponyms .. automethod:: holonyms .. automethod:: meronyms .. automethod:: relations .. automethod:: get_related .. automethod:: closure .. automethod:: relation_paths .. automethod:: translate .. The taxonomy methods below have been moved to wn.taxonomy .. method:: hypernym_paths(simulate_root=False) Shortcut for :func:`wn.taxonomy.hypernym_paths`. .. method:: min_depth(simulate_root=False) Shortcut for :func:`wn.taxonomy.min_depth`. .. method:: max_depth(simulate_root=False) Shortcut for :func:`wn.taxonomy.max_depth`. .. 
method:: shortest_path(other, simulate_root=False) Shortcut for :func:`wn.taxonomy.shortest_path`. .. method:: common_hypernyms(other, simulate_root=False) Shortcut for :func:`wn.taxonomy.common_hypernyms`. .. method:: lowest_common_hypernyms(other, simulate_root=False) Shortcut for :func:`wn.taxonomy.lowest_common_hypernyms`. Relations --------- The :meth:`Sense.relation_map` and :meth:`Synset.relation_map` methods return a dictionary mapping :class:`Relation` objects to resolved target senses or synsets. They differ from :meth:`Sense.relations` and :meth:`Synset.relations` in two main ways: 1. Relation objects map 1-to-1 to their targets instead of to a list of targets sharing the same relation name. 2. Relation objects encode not just relation names, but also the identifiers of sources and targets, the lexicons they came from, and any metadata they have. One reason why :class:`Relation` objects are useful is for inspecting relation metadata, particularly in order to distinguish ``other`` relations that differ only by the value of their ``dc:type`` metadata: >>> oewn = wn.Wordnet('oewn:2024') >>> alloy = oewn.senses("alloy", pos="v")[0] >>> alloy.relations() # appears to only have one 'other' relation {'derivation': [Sense('oewn-alloy__1.27.00..')], 'other': [Sense('oewn-alloy__1.27.00..')]} >>> for rel in alloy.relation_map(): # but in fact there are two ... print(rel, rel.subtype) ... Relation('derivation', 'oewn-alloy__2.30.00..', 'oewn-alloy__1.27.00..') None Relation('other', 'oewn-alloy__2.30.00..', 'oewn-alloy__1.27.00..') material Relation('other', 'oewn-alloy__2.30.00..', 'oewn-alloy__1.27.00..') result Another reason why they are useful is to determine the source of a relation used in :doc:`interlingual queries <../guides/interlingual>`. 
>>> es = wn.Wordnet("omw-es", expand="omw-en") >>> mapa = es.synsets("mapa", pos="n")[0] >>> rel, tgt = next(iter(mapa.relation_map().items())) >>> rel, rel.lexicon() # relation comes from omw-en (Relation('hypernym', 'omw-en-03720163-n', 'omw-en-04076846-n'), ) >>> tgt, tgt.words(), tgt.lexicon() # target is in omw-es (Synset('omw-es-04076846-n'), [Word('omw-es-representación-n')], ) .. class:: Relation :class:`Relation` objects model relations between senses or synsets. .. attribute:: name The name of the relation. Also called the relation "type". .. attribute:: source_id The identifier of the source entity of the relation. .. attribute:: target_id The identifier of the target entity of the relation. .. autoattribute:: subtype .. automethod:: lexicon .. automethod:: metadata .. automethod:: confidence Additional Classes ------------------ .. class:: Form :class:`Form` objects are returned by :meth:`Word.lemma` and :meth:`Word.forms` when the :python:`data=True` argument is used, and they make accessible several optional properties of word forms. The word form itself is available via the :attr:`value` attribute. >>> inu = wn.words('犬', lexicon='wnja')[0] >>> inu.forms(data=True)[3] Form(value='いぬ') >>> inu.forms(data=True)[3].script 'hira' The :attr:`script` is often unspecified (i.e., :python:`None`) and this carries the implicit meaning that the form uses the canonical script for the word's language or wordnet, whatever it may be. .. attribute:: value The word form string. .. attribute:: id An optional form identifier used within a lexicon. These identifiers are often :python:`None`. .. attribute:: script The script of the word form. This should be an `ISO 15924 `_ code, or :python:`None`. .. method:: pronunciations Return the list of :class:`Pronunciation` objects. .. method:: tags Return the list of :class:`Tag` objects. .. automethod:: lexicon .. 
class:: Pronunciation :class:`Pronunciation` objects encode a text or audio representation of how a word is pronounced. They are returned by :meth:`Form.pronunciations`. .. autoattribute:: value The encoded pronunciation. .. autoattribute:: variety The language variety this pronunciation belongs to. .. autoattribute:: notation The notation used to encode the pronunciation. For example: the International Phonetic Alphabet (IPA). .. autoattribute:: phonemic :python:`True` when the encoded pronunciation is a generalized phonemic description, or :python:`False` for more precise phonetic transcriptions. .. autoattribute:: audio A URI to an associated audio file. .. automethod:: lexicon .. autoclass:: Tag :class:`Tag` objects encode categorical information about word forms. They are returned by :meth:`Form.tags`. .. autoattribute:: tag The text value of the tag. .. autoattribute:: category The category, or kind, of the tag. .. automethod:: lexicon .. autoclass:: Count :class:`Count` objects model sense counts previously computed over some corpus. They are returned by :meth:`Sense.counts`. .. autoattribute:: value The count of sense occurrences. .. automethod:: lexicon .. automethod:: metadata .. automethod:: confidence .. class:: Example :class:`Example` objects model example phrases for senses and synsets. They are returned by :meth:`Sense.examples` and :meth:`Synset.examples` when the :python:`data=True` argument is given. .. autoattribute:: text The example text. .. autoattribute:: language The language of the example. .. automethod:: lexicon .. automethod:: metadata .. automethod:: confidence .. class:: Definition :class:`Definition` objects model synset definitions. They are returned by :meth:`Synset.definition` when the :python:`data=True` argument is given. .. autoattribute:: text The example text. .. autoattribute:: language The language of the example. .. autoattribute:: source_sense_id The id of the particular sense the definition is for. .. 
automethod:: lexicon .. automethod:: metadata .. automethod:: confidence Interlingual Indices -------------------- As of Wn v1.0.0, see :mod:`wn.ili` classes and functions for ILIs Lexicon Objects --------------- .. class:: Lexicon Lexicon objects contain attributes and metadata about a single :doc:`lexicon <../guides/lexicons>`. .. autoattribute:: id The lexicon's identifier. .. autoattribute:: label The full name of lexicon. .. autoattribute:: language The BCP 47 language code of lexicon. .. autoattribute:: email The email address of the wordnet maintainer. .. autoattribute:: license The URL or name of the wordnet's license. .. autoattribute:: version The version string of the resource. .. autoattribute:: url The project URL of the wordnet. .. autoattribute:: citation The canonical citation for the project. .. autoattribute:: logo A URL or path to a project logo. .. automethod:: metadata .. automethod:: confidence .. automethod:: specifier .. automethod:: modified .. automethod:: requires .. automethod:: extends .. automethod:: extensions .. automethod:: describe The wn.config Object -------------------- Wn's data storage and retrieval can be configured through the :data:`wn.config` object. .. seealso:: :doc:`../setup` describes how to configure Wn using the :data:`wn.config` instance. .. autodata:: config It is an instance of the :class:`~wn._config.WNConfig` class, which is defined in a non-public module and is not meant to be instantiated directly. Configuration should occur through the single :data:`wn.config` instance. .. autoclass:: wn._config.WNConfig .. autoattribute:: data_directory .. autoattribute:: database_path .. attribute:: allow_multithreading If set to :python:`True`, the database connection may be shared across threads. In this case, it is the user's responsibility to ensure that multiple threads don't try to write to the database at the same time. The default is :python:`False`. .. autoattribute:: downloads_directory .. 
automethod:: add_project .. automethod:: add_project_version .. automethod:: get_project_info .. automethod:: get_cache_path .. automethod:: update .. automethod:: load_index Exceptions ---------- .. autoexception:: Error .. autoexception:: DatabaseError .. autoexception:: WnWarning wn-1.0.0/docs/api/wn.similarity.rst000066400000000000000000000137061513755206300172160ustar00rootroot00000000000000wn.similarity ============= .. automodule:: wn.similarity Taxonomy-based Metrics ---------------------- The `Path `_, `Leacock-Chodorow `_, and `Wu-Palmer `_ similarity metrics work by finding path distances in the hypernym/hyponym taxonomy. As such, they are most useful when the synsets are, in fact, arranged in a taxonomy. For the Princeton WordNet and derivative wordnets, such as the `Open English Wordnet`_ and `OMW English Wordnet based on WordNet 3.0`_ available to Wn, synsets for nouns and verbs are arranged taxonomically: the nouns mostly form a single structure with a single root while verbs form many smaller structures with many roots. Synsets for the other parts of speech do not use hypernym/hyponym relations at all. This situation may be different for other wordnet projects or future versions of the English wordnets. .. _Open English Wordnet: https://en-word.net .. _OMW English Wordnet based on WordNet 3.0: https://github.com/omwn/omw-data The similarity metrics tend to fail when the synsets are not connected by some path. When the synsets are in different parts of speech, or even in separate lexicons, this failure is acceptable and expected. But for cases like the verbs in the Princeton WordNet, it might be more useful to pretend that there is some unique root for all verbs so as to create a path connecting any two of them. For this purpose, the *simulate_root* parameter is available on the :func:`path`, :func:`lch`, and :func:`wup` functions, where it is passed on to calls to :meth:`wn.Synset.shortest_path` and :meth:`wn.Synset.lowest_common_hypernyms`. 
Setting *simulate_root* to :python:`True` can, however, give surprising results if the words are from a different lexicon. Currently, computing similarity for synsets from a different part of speech raises an error. Path Similarity ''''''''''''''' When :math:`p` is the length of the shortest path between two synsets, the path similarity is: .. math:: \frac{1}{p + 1} The similarity score ranges between 0.0 and 1.0, where the higher the score is, the more similar the synsets are. The score is 1.0 when a synset is compared to itself, and 0.0 when there is no path between the two synsets (i.e., the path distance is infinite). .. autofunction:: path .. _leacock-chodorow-similarity: Leacock-Chodorow Similarity ''''''''''''''''''''''''''' When :math:`p` is the length of the shortest path between two synsets and :math:`d` is the maximum taxonomy depth, the Leacock-Chodorow similarity is: .. math:: -\text{log}\left(\frac{p + 1}{2d}\right) .. autofunction:: lch Wu-Palmer Similarity '''''''''''''''''''' When *LCS* is the lowest common hypernym (also called "least common subsumer") between two synsets, :math:`i` is the shortest path distance from the first synset to *LCS*, :math:`j` is the shortest path distance from the second synset to *LCS*, and :math:`k` is the number of nodes (distance + 1) from *LCS* to the root node, then the Wu-Palmer similarity is: .. math:: \frac{2k}{i + j + 2k} .. autofunction:: wup Information Content-based Metrics --------------------------------- The `Resnik `_, `Jiang-Conrath `_, and `Lin `_ similarity metrics work by computing the information content of the synsets and/or that of their lowest common hypernyms. They therefore require information content weights (see :mod:`wn.ic`), and the values returned necessarily depend on the weights used. Resnik Similarity ''''''''''''''''' The Resnik similarity (`Resnik 1995 `_) is the maximum information content value of the common subsumers (hypernym ancestors) of the two synsets. 
Formally it is defined as follows, where :math:`c_1` and :math:`c_2` are the two synsets being compared. .. math:: \text{max}_{c \in \text{S}(c_1, c_2)} \text{IC}(c) Since a synset's information content is always equal or greater than the information content of its hypernyms, :math:`S(c_1, c_2)` above is more efficiently computed using the lowest common hypernyms instead of all common hypernyms. .. autofunction:: res Jiang-Conrath Similarity '''''''''''''''''''''''' The Jiang-Conrath similarity metric (`Jiang and Conrath, 1997 `_) combines the ideas of the taxonomy-based and information content-based metrics. It is defined as follows, where :math:`c_1` and :math:`c_2` are the two synsets being compared and :math:`c_0` is the lowest common hypernym of the two with the highest information content weight: .. math:: \frac{1}{\text{IC}(c_1) + \text{IC}(c_2) - 2(\text{IC}(c_0))} This equation is the simplified form given in the paper were several parameterized terms are cancelled out because the full form is not often used in practice. There are two special cases: 1. If the information content of :math:`c_0`, :math:`c_1`, and :math:`c_2` are all zero, the metric returns zero. This occurs when both :math:`c_1` and :math:`c_2` are the root node, but it can also occur if the synsets did not occur in the corpus and the smoothing value was set to zero. 2. Otherwise if :math:`c_1 + c_2 = 2c_0`, the metric returns infinity. This occurs when the two synsets are the same, one is a descendant of the other, etc., such that they have the same frequency as each other and as their lowest common hypernym. .. autofunction:: jcn Lin Similarity '''''''''''''' Another formulation of information content-based similarity is the Lin metric (`Lin 1997 `_), which is defined as follows, where :math:`c_1` and :math:`c_2` are the two synsets being compared and :math:`c_0` is the lowest common hypernym with the highest information content weight: .. 
math:: \frac{2(\text{IC}(c_0))}{\text{IC}(c_1) + \text{IC}(c_0)} One special case is if either synset has an information content value of zero, in which case the metric returns zero. .. autofunction:: lin wn-1.0.0/docs/api/wn.taxonomy.rst000066400000000000000000000042361513755206300167040ustar00rootroot00000000000000 wn.taxonomy =========== .. automodule:: wn.taxonomy Overview -------- Among the valid synset relations for wordnets (see :data:`wn.constants.SYNSET_RELATIONS`), those used for describing *is-a* `taxonomies `_ are given special treatment and they are generally the most well-developed relations in any wordnet. Typically these are the ``hypernym`` and ``hyponym`` relations, which encode *is-a-type-of* relationships (e.g., a *hermit crab* is a type of *decapod*, which is a type of *crustacean*, etc.). They also include ``instance_hypernym`` and ``instance_hyponym``, which encode *is-an-instance-of* relationships (e.g., *Oregon* is an instance of *American state*). The taxonomy forms a multiply-inheriting hierarchy with the synsets as nodes. In the English wordnets, such as the Princeton WordNet and its derivatives, nearly all nominal synsets form such a hierarchy with single root node, while verbal synsets form many smaller hierarchies without a common root. Other wordnets may have different properties, but as many are based off of the Princeton WordNet, they tend to follow this structure. Functions to find paths within the taxonomies form the basis of all :mod:`wordnet similarity measures `. For instance, the :ref:`leacock-chodorow-similarity` measure uses both :func:`shortest_path` and (indirectly) :func:`taxonomy_depth`. Wordnet-level Functions ----------------------- Root and leaf synsets in the taxonomy are those with no ancestors (``hypernym``, ``instance_hypernym``, etc.) or hyponyms (``hyponym``, ``instance_hyponym``, etc.), respectively. Finding root and leaf synsets ''''''''''''''''''''''''''''' .. autofunction:: roots .. 
autofunction:: leaves Computing the taxonomy depth '''''''''''''''''''''''''''' The taxonomy depth is the maximum depth from a root node to a leaf node within synsets for a particular part of speech. .. autofunction:: taxonomy_depth Synset-level Functions ---------------------- .. autofunction:: hypernym_paths .. autofunction:: min_depth .. autofunction:: max_depth .. autofunction:: shortest_path .. autofunction:: common_hypernyms .. autofunction:: lowest_common_hypernyms wn-1.0.0/docs/api/wn.util.rst000066400000000000000000000011641513755206300160000ustar00rootroot00000000000000wn.util ======= .. automodule:: wn.util .. autofunction:: synset_id_formatter .. autoclass:: ProgressHandler :members: .. attribute:: kwargs A dictionary storing the updateable parameters for the progress handler. The keys are: - ``message`` (:class:`str`) -- a generic message or name - ``count`` (:class:`int`) -- the current progress counter - ``total`` (:class:`int`) -- the expected final value of the counter - ``unit`` (:class:`str`) -- the unit of measurement - ``status`` (:class:`str`) -- the current status of the process .. autoclass:: ProgressBar :members: wn-1.0.0/docs/api/wn.validate.rst000066400000000000000000000001221513755206300166050ustar00rootroot00000000000000 wn.validate =========== .. automodule:: wn.validate .. autofunction:: validate wn-1.0.0/docs/cli.rst000066400000000000000000000063021513755206300143750ustar00rootroot00000000000000Command Line Interface ====================== Some of Wn's functionality is exposed via the command line. Global Options -------------- .. option:: -d DIR, --dir DIR Change to use ``DIR`` as the data directory prior to invoking any commands. Subcommands ----------- download -------- Download and add projects to the database given one or more project specifiers or URLs. .. code-block:: console $ python -m wn download oewn:2021 omw:1.4 cili $ python -m wn download https://en-word.net/static/english-wordnet-2021.xml.gz .. 
option:: --index FILE Use the index at ``FILE`` to resolve project specifiers. .. code-block:: console $ python -m wn download --index my-index.toml mywn .. option:: --no-add Download and cache the remote file, but don't add it to the database. lexicons -------- The ``lexicons`` subcommand lets you quickly see what is installed: .. code-block:: console $ python -m wn lexicons omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0 omw-sk 1.4 [sk] Slovak WordNet omw-pl 1.4 [pl] plWordNet omw-is 1.4 [is] IceWordNet omw-zsm 1.4 [zsm] Wordnet Bahasa (Malaysian) omw-sl 1.4 [sl] sloWNet omw-ja 1.4 [ja] Japanese Wordnet ... .. option:: -l LG, --lang LG .. option:: --lexicon SPEC The ``--lang`` or ``--lexicon`` option can help you narrow down the results: .. code-block:: console $ python -m wn lexicons --lang en oewn 2021 [en] Open English WordNet omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0 $ python -m wn lexicons --lexicon "omw-*" omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0 omw-sk 1.4 [sk] Slovak WordNet omw-pl 1.4 [pl] plWordNet omw-is 1.4 [is] IceWordNet omw-zsm 1.4 [zsm] Wordnet Bahasa (Malaysian) projects -------- The ``projects`` subcommand lists all known projects in Wn's index. This is helpful to see what is available for downloading. .. code-block:: $ python -m wn projects ic cili 1.0 [---] Collaborative Interlingual Index ic oewn 2025+ [en] Open English WordNet ic oewn 2025 [en] Open English WordNet ic oewn 2024 [en] Open English WordNet ic oewn 2023 [en] Open English WordNet ic oewn 2022 [en] Open English WordNet ic oewn 2021 [en] Open English WordNet ic ewn 2020 [en] Open English WordNet ic ewn 2019 [en] Open English WordNet ic odenet 1.4 [de] Open German WordNet i- odenet 1.3 [de] Open German WordNet ic omw 2.0 [mul] Open Multilingual Wordnet ic omw 1.4 [mul] Open Multilingual Wordnet ... validate -------- Given a path to a WN-LMF XML file, check the file for structural problems and print a report. .. 
code-block:: $ python -m wn validate english-wordnet-2021.xml .. option:: --select CHECKS Run the checks with the given comma-separated list of check codes or categories. .. code-block:: $ python -m wn validate --select E W201 W204 deWordNet.xml .. option:: --output-file FILE Write the report to FILE as a JSON object instead of printing the report to stdout. wn-1.0.0/docs/conf.py000066400000000000000000000072511513755206300143770ustar00rootroot00000000000000# Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # # import os # import sys # sys.path.insert(0, os.path.abspath('.')) # -- Project information ----------------------------------------------------- import wn project = "wn" copyright = "2020, Michael Wayne Goodman" author = "Michael Wayne Goodman" # The short X.Y version version = ".".join(wn.__version__.split(".")[:2]) # The full version, including alpha/beta/rc tags release = wn.__version__ # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.coverage", # 'sphinx.ext.viewcode', "sphinx.ext.githubpages", "sphinx.ext.napoleon", "sphinx_copybutton", ] # Add any paths that contain templates here, relative to this directory. 
templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # Global definitions rst_prolog = """ .. role:: python(code) :language: python :class: highlight """ # smartquotes = False smartquotes_action = "De" # D = en- and em-dash; e = ellipsis # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes.# html_theme = "furo" html_theme_options = { "light_css_variables": { "color-brand-primary": "#006699", "color-brand-content": "#006699", # "color-background": "#f0f0f0", # "color-sidebar-background": "#ddd", }, "dark_css_variables": { "color-brand-primary": "#00CCFF", "color-brand-content": "#00CCFF", }, } html_logo = "_static/wn-logo.svg" pygments_style = "manni" pygments_dark_style = "monokai" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] html_css_files = [ "css/svg.css", ] # Don't offer to show the source of the current page html_show_sourcelink = False # -- Options for autodoc extension ------------------------------------------- # autodoc_typehints = 'description' autodoc_typehints = "signature" # autodoc_typehints = 'none' # -- Options for intersphinx extension --------------------------------------- # Example configuration for intersphinx: refer to the Python standard library. 
intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "httpx": ("https://httpx.readthedocs.io/en/latest/", None), } # -- Options for sphinx_copybutton extension --------------------------------- copybutton_prompt_text = ( r">>> " # regular Python prompt r"|\.\.\. " # Python continuation prompt r"|\$ " # Basic shell r"|In \[\d*\]: " # Jupyter notebook ) copybutton_prompt_is_regexp = True wn-1.0.0/docs/docutils.conf000066400000000000000000000000641513755206300155700ustar00rootroot00000000000000[restructuredtext parser] syntax_highlight = short wn-1.0.0/docs/faq.rst000066400000000000000000000130531513755206300143760ustar00rootroot00000000000000FAQ === Is Wn related to the NLTK's `nltk.corpus.wordnet` module? --------------------------------------------------------- Only in spirit. There was an effort to develop the `NLTK`_\ 's module as a standalone package (see https://github.com/nltk/wordnet/), but development had slowed. Wn has the same broad goals and a similar API as that standalone package, but fundamental architectural differences demanded a complete rewrite, so Wn was created as a separate project. With approval from the other package's maintainer, Wn acquired the `wn `_ project on PyPI and can be seen as its successor. Is Wn compatible with the NLTK's module? ---------------------------------------- The API is intentionally similar, but not exactly the same (for instance see the next question), and there are differences in the ways that results are retrieved, particularly for non-English wordnets. See :doc:`guides/nltk-migration` for more information. Also see :ref:`princeton-wordnet`. Where are the ``Lemma`` objects? What are ``Word`` and ``Sense`` objects? 
------------------------------------------------------------------------- Unlike the original `WNDB`_ data format of the original WordNet, the `WN-LMF`_ XML format grants words (called *lexical entries* in WN-LMF and a :class:`~wn.Word` object in Wn) and word senses (:class:`~wn.Sense` in Wn) explicit, first-class status alongside synsets. While senses are essentially links between words and synsets, they may contain metadata and be the source or target of sense relations, so in some ways they are more like nodes than edges when the wordnet is viewed as a graph. The `NLTK`_\ 's module, using the WNDB format, combines the information of a word and a sense into a single object called a ``Lemmas``. Wn also has an unrelated concept called a :meth:`~wn.Word.lemma`, but it is merely the canonical form of a word. .. _princeton-wordnet: Where is the Princeton WordNet data? ------------------------------------ The original English wordnet, named simply *WordNet* but often referred to as the *Princeton WordNet* to better distinguish it from other projects, is specifically the data distributed by Princeton in the `WNDB`_ format. The `Open Multilingual Wordnet `_ (OMW) packages an export of the WordNet data as the *OMW English Wordnet based on WordNet 3.0* which is used by Wn (with the lexicon ID ``omw-en``). It also has a similar export for WordNets 1.5, 1.6, 1.7, 1.7.1, 2.0, 2.1, and 3.1 data (``omw-en15``, ``omw-en16``, ``omw-en17``, ``omw-en171``, ``omw-en20``, ``omw-en21``, and ``omw-en31``, respectively). All of these are highly compatible with the original data and can be used as drop-in replacements. Prior to Wn version 0.9 (and, correspondingly, prior to the `OMW data`_ version 1.4), the ``pwn:3.0`` and ``pwn:3.1`` English wordnets distributed by OMW were incorrectly called the *Princeton WordNet* (for WordNet 3.0 and 3.1, respectively). 
From Wn version 0.9 (and from version 1.4 of the OMW data), these are called the *OMW English Wordnet based on WordNet 3.0/3.1* (``omw-en:1.4`` and ``omw-en31:1.4``, respectively). These lexicons are intentionally compatible with the original WordNet data, and the 1.4 versions are even more compatible than the previous ``pwn:3.0`` and ``pwn:3.1`` lexicons, so it is strongly recommended to use them over the previous versions. Similarly, the 2.0 version of OMW is more compatible yet. The data corresponding to WordNet versions 1.5 through 2.1 are only available from OMW 2.0. .. _OMW data: https://github.com/omwn/omw-data Why don't all wordnets share the same synsets? ---------------------------------------------- The `Open Multilingual Wordnet `_ (OMW) contains wordnets for many languages created using the *expand* methodology [VOSSEN1998]_, where non-English wordnets provide words on top of the English wordnet's synset structure. This allows new wordnets to be built in much less time than starting from scratch, but with a few drawbacks, such as that words cannot be added if they do not have a synset in the English wordnet, and that it is difficult to version the wordnets independently (e.g., for reproducibility of experiments involving wordnet data) as all are interconnected. Wn, therefore, creates new synsets for each wordnet added to its database, and synsets then specify which resource they belong to. Queries can specify which resources may be examined. Also see :doc:`guides/interlingual`. Why does Wn's database get so big? ---------------------------------- The *OMW English Wordnet based on WordNet 3.0* takes about 114 MiB of disk space in Wn's database, which is only about 8 MiB more than it takes as a `WN-LMF`_ XML file. The `NLTK`_, however, uses the obsolete `WNDB`_ format which is more compact, requiring only 35 MiB of disk space. 
The difference with the Open Multilingual Wordnet 1.4 is more striking: it takes about 659 MiB of disk space in the database, but only 49 MiB in the NLTK. Part of the difference here is that the OMW files in the NLTK are simple tab-separated-value files listing only the words added to each synset for each language. In addition, Wn creates new synsets for each wordnet added (see the previous question). One more reason is that Wn creates various indexes in the database for efficient lookup. .. _NLTK: https://www.nltk.org/ .. _OMW: http://github.com/omwn .. [VOSSEN1998] Piek Vossen. 1998. *Introduction to EuroWordNet.* Computers and the Humanities, 32(2): 73--89. .. _Open English Wordnet 2021: https://en-word.net/ .. _WNDB: https://wordnet.princeton.edu/documentation/wndb5wn .. _WN-LMF: https://globalwordnet.github.io/schemas/ wn-1.0.0/docs/guides/000077500000000000000000000000001513755206300143535ustar00rootroot00000000000000wn-1.0.0/docs/guides/basic.rst000066400000000000000000000212111513755206300161630ustar00rootroot00000000000000Basic Usage =========== .. seealso:: This document covers the basics of querying wordnets, filtering results, and performing secondary queries on the results. For adding, removing, or inspecting lexicons, see :doc:`lexicons`. For more information about interlingual queries, see :doc:`interlingual`. 
For the most basic queries, Wn provides several module functions for retrieving words, senses, and synsets: >>> import wn >>> wn.words('pike') [Word('ewn-pike-n')] >>> wn.senses('pike') [Sense('ewn-pike-n-03311555-04'), Sense('ewn-pike-n-07795351-01'), Sense('ewn-pike-n-03941974-01'), Sense('ewn-pike-n-03941726-01'), Sense('ewn-pike-n-02563739-01')] >>> wn.synsets('pike') [Synset('ewn-03311555-n'), Synset('ewn-07795351-n'), Synset('ewn-03941974-n'), Synset('ewn-03941726-n'), Synset('ewn-02563739-n')] Once you start working with multiple wordnets, these simple queries may return more than desired: >>> wn.words('pike') [Word('ewn-pike-n'), Word('wnja-n-66614')] >>> wn.words('chat') [Word('ewn-chat-n'), Word('ewn-chat-v'), Word('frawn-lex14803'), Word('frawn-lex21897')] You can specify which language or lexicon you wish to query: >>> wn.words('pike', lang='ja') [Word('wnja-n-66614')] >>> wn.words('chat', lexicon='frawn') [Word('frawn-lex14803'), Word('frawn-lex21897')] But it might be easier to create a :class:`~wn.Wordnet` object and use it for queries: >>> wnja = wn.Wordnet(lang='ja') >>> wnja.words('pike') [Word('wnja-n-66614')] >>> frawn = wn.Wordnet(lexicon='frawn') >>> frawn.words('chat') [Word('frawn-lex14803'), Word('frawn-lex21897')] In fact, the simple queries above implicitly create such a :class:`~wn.Wordnet` object, but one that includes all installed lexicons. .. _primary-queries: Primary Queries --------------- The queries shown above are "primary" queries, meaning they are the first step in a user's interaction with a wordnet. Operations performed on the resulting objects are then `secondary queries`_. Primary queries optionally take several fields for filtering the results, namely the word form and part of speech. Synsets may also be filtered by an interlingual index (ILI). 
Searching for Words ''''''''''''''''''' The :func:`wn.words()` function returns a list of :class:`~wn.Word` objects that match the given word form or part of speech: >>> wn.words('pencil') [Word('ewn-pencil-n'), Word('ewn-pencil-v')] >>> wn.words('pencil', pos='v') [Word('ewn-pencil-v')] Calling the function without a word form will return all words in the database: >>> len(wn.words()) 311711 >>> len(wn.words(pos='v')) 29419 >>> len(wn.words(pos='v', lexicon='ewn')) 11595 If you know the word identifier used by a lexicon, you can retrieve the word directly with the :func:`wn.word()` function. Identifiers are guaranteed to be unique within a single lexicon, but not across lexicons, so it's best to call this function from an instantiated :class:`~wn.Wordnet` object or with the ``lexicon`` parameter specified. If multiple words are found when querying multiple lexicons, only the first is returned. >>> wn.word('ewn-pencil-n', lexicon='ewn') Word('ewn-pencil-n') Searching for Senses '''''''''''''''''''' The :func:`wn.senses()` and :func:`wn.sense()` functions behave similarly to :func:`wn.words()` and :func:`wn.word()`, except that they return matching :class:`~wn.Sense` objects. >>> wn.senses('plow', pos='n') [Sense('ewn-plow-n-03973894-01')] >>> wn.sense('ewn-plow-v-01745745-01') Sense('ewn-plow-v-01745745-01') Senses represent a relationship between a :class:`~wn.Word` and a :class:`~wn.Synset`. Seen as an edge between nodes, senses are often given less prominence than words or synsets, but they are the natural locus of several interesting features such as sense relations (e.g., for derived words) and the natural level of representation for translations to other languages. 
Searching for Synsets ''''''''''''''''''''' The :func:`wn.synsets()` and :func:`wn.synset()` functions are like those above but allow the ``ili`` parameter for filtering by interlingual index, which is useful in interlingual queries: >>> wn.synsets('scepter') [Synset('ewn-14467142-n'), Synset('ewn-07282278-n')] >>> wn.synset('ewn-07282278-n').ili 'i74874' >>> wn.synsets(ili='i74874') [Synset('ewn-07282278-n'), Synset('wnja-07267573-n'), Synset('frawn-07267573-n')] Secondary Queries ----------------- Once you have gotten some results from a primary query, you can perform operations on the :class:`~wn.Word`, :class:`~wn.Sense`, or :class:`~wn.Synset` objects to get at further information in the wordnet. Exploring Words ''''''''''''''' Here are some of the things you can do with :class:`~wn.Word` objects: >>> w = wn.words('goose')[0] >>> w.pos # part of speech 'n' >>> w.forms() # other word forms (e.g., irregular inflections) ['goose', 'geese'] >>> w.lemma() # canonical form 'goose' >>> w.derived_words() [Word('ewn-gosling-n'), Word('ewn-goosy-s'), Word('ewn-goosey-s')] >>> w.senses() [Sense('ewn-goose-n-01858313-01'), Sense('ewn-goose-n-10177319-06'), Sense('ewn-goose-n-07662430-01')] >>> w.synsets() [Synset('ewn-01858313-n'), Synset('ewn-10177319-n'), Synset('ewn-07662430-n')] Since translations of a word into another language depend on the sense used, :meth:`Word.translate ` returns a dictionary mapping each sense to words in the target language: >>> for sense, ja_words in w.translate(lang='ja').items(): ... print(sense, ja_words) ... Sense('ewn-goose-n-01858313-01') [Word('wnja-n-1254'), Word('wnja-n-33090'), Word('wnja-n-38995')] Sense('ewn-goose-n-10177319-06') [] Sense('ewn-goose-n-07662430-01') [Word('wnja-n-1254')] Exploring Senses '''''''''''''''' Compared to :class:`~wn.Word` and :class:`~wn.Synset` objects, there are relatively few operations available on :class:`~wn.Sense` objects. 
Sense relations and translations, however, are important operations on senses. >>> s = wn.senses('dark', pos='n')[0] >>> s.word() # each sense links to a single word Word('ewn-dark-n') >>> s.synset() # each sense links to a single synset Synset('ewn-14007000-n') >>> s.get_related('antonym') [Sense('ewn-light-n-14006789-01')] >>> s.get_related('derivation') [Sense('ewn-dark-a-00273948-01')] >>> s.translate(lang='fr') # translation returns a list of senses [Sense('frawn-lex52992--13983515-n')] >>> s.translate(lang='fr')[0].word().lemma() 'obscurité' Exploring Synsets ''''''''''''''''' Many of the operations people care about happen on synsets, such as hierarchical relations and metrics. >>> ss = wn.synsets('hound', pos='n')[0] >>> ss.senses() [Sense('ewn-hound-n-02090203-01'), Sense('ewn-hound_dog-n-02090203-02')] >>> ss.words() [Word('ewn-hound-n'), Word('ewn-hound_dog-n')] >>> ss.lemmas() ['hound', 'hound dog'] >>> ss.definition() 'any of several breeds of dog used for hunting typically having large drooping ears' >>> ss.hypernyms() [Synset('ewn-02089774-n')] >>> ss.hypernyms()[0].lemmas() ['hunting dog'] >>> len(ss.hyponyms()) 20 >>> ss.hyponyms()[0].lemmas() ['Afghan', 'Afghan hound'] >>> ss.max_depth() 15 >>> ss.shortest_path(wn.synsets('dog', pos='n')[0]) [Synset('ewn-02090203-n'), Synset('ewn-02089774-n'), Synset('ewn-02086723-n')] >>> ss.translate(lang='fr') # translation returns a list of synsets [Synset('frawn-02087551-n')] >>> ss.translate(lang='fr')[0].lemmas() ['chien', 'chien de chasse'] Filtering by Language --------------------- The ``lang`` parameter of :func:`wn.words()`, :func:`wn.senses()`, :func:`wn.synsets()`, and :class:`~wn.Wordnet` allows a single `BCP 47 `_ language code. When this parameter is used, only entries in the specified language will be returned. 
>>> import wn >>> wn.words('chat') [Word('ewn-chat-n'), Word('ewn-chat-v'), Word('frawn-lex14803'), Word('frawn-lex21897')] >>> wn.words('chat', lang='fr') [Word('frawn-lex14803'), Word('frawn-lex21897')] If a language code not used by any lexicon is specified, a :exc:`wn.Error` is raised. Filtering by Lexicon -------------------- The ``lexicon`` parameter of :func:`wn.words()`, :func:`wn.senses()`, :func:`wn.synsets()`, and :class:`~wn.Wordnet` take a string of space-delimited :ref:`lexicon specifiers `. Entries in a lexicon whose ID matches one of the lexicon specifiers will be returned. For these, the following rules are used: - A full ``id:version`` string (e.g., ``ewn:2020``) selects a specific lexicon - Only a lexicon ``id`` (e.g., ``ewn``) selects the most recently added lexicon with that ID - A star ``*`` may be used to match any lexicon; a star may not include a version >>> wn.words('chat', lexicon='ewn:2020') [Word('ewn-chat-n'), Word('ewn-chat-v')] >>> wn.words('chat', lexicon='wnja') [] >>> wn.words('chat', lexicon='wnja frawn') [Word('frawn-lex14803'), Word('frawn-lex21897')] wn-1.0.0/docs/guides/images/000077500000000000000000000000001513755206300156205ustar00rootroot00000000000000wn-1.0.0/docs/guides/images/sense-sense.svg000066400000000000000000000225001513755206300205700ustar00rootroot00000000000000 image/svg+xml behavioral conduct behavior synset synset pertainym wn-1.0.0/docs/guides/images/sense-synset.svg000066400000000000000000000247661513755206300210200ustar00rootroot00000000000000 image/svg+xml pointer cursor computing computer science synset synset has domain topic wn-1.0.0/docs/guides/images/synset-synset.svg000066400000000000000000000350341513755206300212160ustar00rootroot00000000000000 image/svg+xml cab taxi hack car auto automobile synset synset hypernym wn-1.0.0/docs/guides/images/word-sense-synset.svg000066400000000000000000000423251513755206300217600ustar00rootroot00000000000000 image/svg+xml sense sense2 word synset A synset B word 
word sense sense1 sense wn-1.0.0/docs/guides/interlingual.rst000066400000000000000000000230251513755206300176040ustar00rootroot00000000000000Interlingual Queries ==================== This guide explains how interlingual queries work within Wn. To get started, you'll need at least two lexicons that use interlingual indices (ILIs). For this guide, we'll use the Open English WordNet (``oewn:2024``), the Open German WordNet (``odenet:1.4``), also known as OdeNet, and the Japanese wordnet (``omw-ja:1.4``). >>> import wn >>> wn.download('oewn:2024') >>> wn.download('odenet:1.4') >>> wn.download('omw-ja:1.4') We will query these wordnets with the following :class:`~wn.Wordnet` objects: >>> en = wn.Wordnet('oewn:2024') >>> de = wn.Wordnet('odenet:1.4') The object for the Japanese wordnet will be discussed and created below, in :ref:`cross-lingual-relation-traversal`. What are Interlingual Indices? ------------------------------ It is common for users of the `Princeton WordNet `_ to refer to synsets by their `WNDB `_ offset and type, but this is problematic because the offset is a byte-offset in the wordnet data files and it will differ for wordnets in other languages and even between versions of the same wordnet. Interlingual indices (ILIs) address this issue by providing stable identifiers for concepts, whether for a synset across versions of a wordnet or across languages. The idea of ILIs was proposed by [Vossen99]_ and it came to fruition with the release of the Collaborative Interlingual Index (CILI; [Bond16]_). CILI therefore represents an instance of, and a namespace for, ILIs. There could, in theory, be alternative indexes for particular domains (e.g., names of people or places), but currently there is only the one. As an example, the synset for *apricot* (fruit) in WordNet 3.0 is ``07750872-n``, but it is ``07766848-n`` in WordNet 3.1. 
In OdeNet 1.4, which is not released in the WNDB format and therefore doesn't use offsets at all, it is ``13235-n`` for the equivalent word (*Aprikose*). However, all three use the same ILI: ``i77784``. Generally, only one synset within a wordnet will be mapped to a particular ILI, but this may not always be true, nor does every synset necessarily map to an ILI. Some concepts that are lexicalized in one language may not be in another language. For example, *rice* in English may refer to the rice plant, rice grain, or cooked rice, but in languages like Japanese they are distinct things (稲 *ine*, 米 *kome*, and 飯 *meshi* / ご飯 *gohan*, respectively). The ``ili`` property of Synsets serves two purposes in Wn. Mainly it is for encoding the ILI identifier associated with the synset, but it is also used to indicate when a lexicon is proposing a new concept that is not yet part of CILI. In the latter case, a WN-LMF lexicon file will have the special value of ``in`` for a synset's ILI and it will provide an ```` element. In Wn, this translates to :attr:`wn.Synset.ili` returning :python:`None`, the same as if no ILI were mapped at all. Both synsets with proposed ILIs and those with no ILI cannot be used in interlingual queries. Proposed ILIs can be inspected using the :mod:`wn.ili.get_proposed` function, if you know have the synset, or :mod:`wn.ili.get_all_proposed` to get all of them. .. [Vossen99] Vossen, Piek, Wim Peters, and Julio Gonzalo. "Towards a universal index of meaning." In Proceedings of ACL-99 workshop, Siglex-99, standardizing lexical resources, pp. 81-90. University of Maryland, 1999. .. [Bond16] Bond, Francis, Piek Vossen, John Philip McCrae, and Christiane Fellbaum. "CILI: the Collaborative Interlingual Index." In Proceedings of the 8th Global WordNet Conference (GWC), pp. 50-57. 2016. 
Using Interlingual Indices -------------------------- For synsets that have an associated ILI, you can retrieve it via the :data:`wn.Synset.ili` property: >>> apricot = en.synsets('apricot')[1] >>> apricot.ili 'i77784' The value is a :class:`str` ILI identifier. These may be used directly for things like interlingual synset lookups: >>> de.synsets(ili=apricot.ili)[0].lemmas() ['Marille', 'Aprikose'] There may be more information about the ILI itself which you can get from the :mod:`wn.ili` module: >>> from wn import ili >>> apricot_ili = ili.get(apricot.ili) >>> apricot_ili ILI(id='i77784') From this object you can get various properties of the ILI, such as the ID string, its status, and its definition, but if you have not added CILI to Wn's database, it will not be very informative: >>> apricot_ili.id 'i77784' >>> apricot_ili.status 'presupposed' >>> apricot_ili.definition() is None True The ``presupposed`` status means that the ILI ID is in use by a lexicon, but there is no other source of truth for the index. CILI can be downloaded just like a lexicon: >>> wn.download('cili:1.0') Now the status and definition should be more useful: >>> apricot_ili.status 'active' >>> apricot_ili.definition() 'downy yellow to rosy-colored fruit resembling a small peach' Translating Words, Senses, and Synsets -------------------------------------- Rather than manually inserting the ILI IDs into Wn's lookup functions as shown above, Wn provides the :meth:`wn.Synset.translate` method to make it easier: >>> apricot.translate(lexicon='odenet:1.4') [Synset('odenet-13235-n')] The method returns a list for two reasons: first, it's not guaranteed that the target lexicon has only one synset with the ILI and, second, you can translate to more than one lexicon at a time. 
:class:`~wn.Sense` objects also have a :meth:`~wn.Sense.translate` method, returning a list of senses instead of synsets: >>> de_senses = apricot.senses()[0].translate(lexicon='odenet:1.4') >>> [s.word().lemma() for s in de_senses] ['Marille', 'Aprikose'] :class:`~wn.Word` have a :meth:`~wn.Word.translate` method, too, but it works a bit differently. Since each word may be part of multiple synsets, the method returns a mapping of each word sense to the list of translated words: >>> result = en.words('apricot')[0].translate(lexicon='odenet:1.4') >>> for sense, de_words in result.items(): ... print(sense, [w.lemma() for w in de_words]) ... Sense('oewn-apricot__1.20.00..') [] Sense('oewn-apricot__1.13.00..') ['Marille', 'Aprikose'] Sense('oewn-apricot__1.07.00..') ['lachsrosa', 'lachsfarbig', 'in Lachs', 'lachsfarben', 'lachsrot', 'lachs'] The three senses above are for *apricot* as a tree, a fruit, and a color. OdeNet does not have a synset for apricot trees, or it has one not associated with the appropriate ILI, and therefore it could not translate any words for that sense. .. _cross-lingual-relation-traversal: Cross-lingual Relation Traversal -------------------------------- ILIs have a second use in Wn, which is relation traversal for wordnets that depend on other lexicons, i.e., those created with the *expand* methodology. These wordnets, such as many of those in the `Open Multilingual Wordnet `_, do not include synset relations on their own as they were built using the English WordNet as their taxonomic scaffolding. Trying to load such a lexicon when the lexicon it requires is not added to the database presents a warning to the user: >>> ja = wn.Wordnet('omw-ja:1.4') [...] WnWarning: lexicon dependencies not available: omw-en:1.4 >>> ja.expanded_lexicons() [] .. warning:: Do not rely on the presence of a warning to determine if the lexicon has its expand lexicon loaded. Python's default warning filter may only show the warning the first time it is encountered. 
Instead, inspect :meth:`wn.Wordnet.expanded_lexicons` to see if it is non-empty. When a dependency is unmet, Wn only issues a warning, not an error, and you can continue to use the lexicon as it is, but it won't be useful for exploring relations such as hypernyms and hyponyms: >>> anzu = ja.synsets(ili='i77784')[0] >>> anzu.lemmas() ['アンズ', 'アプリコット', '杏'] >>> anzu.hypernyms() [] One way to resolve this issue is to install the lexicon it requires: >>> wn.download('omw-en:1.4') >>> ja = wn.Wordnet('omw-ja:1.4') # no warning >>> ja.expanded_lexicons() [] Wn will detect the dependency and load ``omw-en:1.4`` as the *expand* lexicon for ``omw-ja:1.4`` when the former is in the database. You may also specify an expand lexicon manually, even one that isn't the specified dependency: >>> ja = wn.Wordnet('omw-ja:1.4', expand='oewn:2024') # no warning >>> ja.expanded_lexicons() [] In this case, the Open English WordNet is an actively-developed fork of the lexicon that ``omw-ja:1.4`` depends on, and it should contain all the relations, so you'll see little difference between using it and ``omw-en:1.4``. This works because the relations are found using ILIs and not synset offsets. You may still prefer to use the specified dependency if you have strict compatibility needs, such as for experiment reproducibility and/or compatibility with the `NLTK `_. Using some other lexicon as the expand lexicon may yield very different results. For instance, ``odenet:1.4`` is much smaller than the English wordnets and has fewer relations, so it would not be a good substitute for ``omw-ja:1.4``'s expand lexicon. 
When an appropriate expand lexicon is loaded, relations between synsets, such as hypernyms, are more likely to be present: >>> anzu = ja.synsets(ili='i77784')[0] # recreate the synset object >>> anzu.hypernyms() [Synset('omw-ja-07705931-n')] >>> anzu.hypernyms()[0].lemmas() ['果物'] >>> anzu.hypernyms()[0].translate(lexicon='oewn:2024')[0].lemmas() ['edible fruit'] wn-1.0.0/docs/guides/lemmatization.rst000066400000000000000000000230401513755206300177610ustar00rootroot00000000000000 Lemmatization and Normalization =============================== Wn provides two methods for expanding queries: lemmatization_ and normalization_\ . Wn also has a setting that allows `alternative forms `_ stored in the database to be included in queries. .. seealso:: The :mod:`wn.morphy` module is a basic English lemmatizer included with Wn. .. _lemmatization: Lemmatization ------------- When querying a wordnet with wordforms from natural language text, it is important to be able to find entries for inflected forms as the database generally contains only lemmatic forms, or *lemmas* (or *lemmata*, if you prefer irregular plurals). >>> import wn >>> en = wn.Wordnet('oewn:2021') >>> en.words('plurals') [] >>> en.words('plural') [Word('oewn-plural-a'), Word('oewn-plural-n')] Lemmas are sometimes called *citation forms* or *dictionary forms* as they are often used as the head words in dictionary entries. In Natural Language Processing (NLP), *lemmatization* is a technique where a possibly inflected word form is transformed to yield a lemma. In Wn, this concept is generalized somewhat to mean a transformation that yields a form matching wordforms stored in the database. For example, the English word *sparrows* is the plural inflection of *sparrow*, while the word *leaves* is ambiguous between the plural inflection of the nouns *leaf* and *leave* and the 3rd-person singular inflection of the verb *leave*. 
For tasks where high accuracy is needed, wrapping the wordnet queries with external tools that handle tokenization, lemmatization, and part-of-speech tagging will likely yield the best results, as this method can make use of word context.
A lemmatizer can be added to a :class:`wn.Wordnet` object. For example, using :mod:`wn.morphy`: >>> import wn >>> from wn.morphy import Morphy >>> en = wn.Wordnet('oewn:2021', lemmatizer=Morphy()) >>> en.words('sparrows') [Word('oewn-sparrow-n')] >>> en.words('leaves') [Word('oewn-leave-v'), Word('oewn-leaf-n'), Word('oewn-leave-n')] Querying Without Lemmatization '''''''''''''''''''''''''''''' When lemmatization is not used, inflected terms may not return any results: >>> en = wn.Wordnet('oewn:2021') >>> en.words('sparrows') [] Depending on the lexicon, there may be situations where results are returned for inflected lemmas, such as when the inflected form is lexicalized as its own entry: >>> en.words('glasses') [Word('oewn-glasses-n')] Or if the lexicon lists the inflected form as an alternative form. For example, the English Wordnet lists irregular inflections as alternative forms: >>> en.words('lemmata') [Word('oewn-lemma-n')] See below for excluding alternative forms from such queries. .. _alternative-forms: Alternative Forms in the Database --------------------------------- A lexicon may include alternative forms in addition to lemmas for each word, and by default these are included in queries. What exactly is included as an alternative form depends on the lexicon. The English Wordnet, for example, adds irregular inflections (or "exceptional forms"), while the Japanese Wordnet includes the same word in multiple orthographies (original, hiragana, katakana, and two romanizations). For the English Wordnet, this means that you might get basic lemmatization for irregular forms only: >>> en = wn.Wordnet('oewn:2021') >>> en.words('learnt', pos='v') [Word('oewn-learn-v')] >>> en.words('learned', pos='v') [] If this is undesirable, the alternative forms can be excluded from queries with the *search_all_forms* parameter: >>> en = wn.Wordnet('oewn:2021', search_all_forms=False) >>> en.words('learnt', pos='v') [] >>> en.words('learned', pos='v') [] .. 
_normalization: Normalization ------------- While lemmatization deals with morphological variants of words, normalization handles minor orthographic variants. Normalized forms, however, may be invalid as wordforms in the target language, and as such they are only used behind the scenes for query expansion and not presented to users. For instance, a user might attempt to look up *résumé* in the English wordnet, but the wordnet only contains the form without diacritics: *resume*. With strict string matching, the entry would not be found using the wordform in the query. By normalizing the query word, the entry can be found. Similarly in the Spanish wordnet, *soñar* (to dream) and *sonar* (to ring) are two different words. A user who types *soñar* likely does not want to get results for *sonar*, but one who types *sonar* may be a non-Spanish speaker who is unaware of the missing diacritic or does not have an input method that allows them to type the diacritic, so this query would return both entries by matching against the normalized forms in the database. Wn handles all of these use cases. When a lexicon is added to the database, potentially two wordforms are inserted for every one in the lexicon: the original wordform and a normalized form. When querying against the database, the original query string is first compared with the original wordforms and, if normalization is enabled, with the normalized forms in the database as well. If this first attempt yields no results and if normalization is enabled, the query string is normalized and tried again. Normalization Functions ''''''''''''''''''''''' The normalized form is obtained from a *normalizer* function, passed as an argument to :class:`wn.Wordnet`, that takes a single string argument and returns a string. That is, a function with the following signature: .. 
code-block:: python normalizer(s: str) -> str While custom *normalizer* functions could be used, in practice the choice is either the default normalizer or :python:`None`. The default normalizer works by downcasing the string and performing NFKD_ normalization to remove diacritics. If the normalized form is the same as the original, only the original is inserted into the database. .. table:: Examples of normalization :align: center ============= =============== Original Form Normalized Form ============= =============== résumé resume soñar sonar San José san jose ハラペーニョ ハラヘーニョ ============= =============== .. _NFKD: https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms Querying With Normalization ''''''''''''''''''''''''''' By default, normalization is enabled when a :class:`wn.Wordnet` is created. Enabling normalization does two things: it allows queries to check the original wordform in the query against the normalized forms in the database and, if no results are returned in the first step, it allows the queried wordform to be normalized as a back-off technique. >>> en = wn.Wordnet('oewn:2021') >>> en.words('résumé') [Word('oewn-resume-n'), Word('oewn-resume-v')] >>> es = wn.Wordnet('omw-es:1.4') >>> es.words('soñar') [Word('omw-es-soñar-v')] >>> es.words('sonar') [Word('omw-es-sonar-v'), Word('omw-es-soñar-v')] .. note:: Users may supply a custom *normalizer* function to the :class:`wn.Wordnet` object, but currently this is discouraged as the result is unlikely to match normalized forms in the database and there is not yet a way to customize the normalization of forms added to the database. Querying Without Normalization '''''''''''''''''''''''''''''' Normalization can be disabled by passing :python:`None` as the argument of the *normalizer* parameter of :class:`wn.Wordnet`. The queried wordform will not be checked against normalized forms in the database and neither will it be normalized as a back-off technique. 
>>> en = wn.Wordnet('oewn:2021', normalizer=None) >>> en.words('résumé') [] >>> es = wn.Wordnet('omw-es:1.4', normalizer=None) >>> es.words('soñar') [Word('omw-es-soñar-v')] >>> es.words('sonar') [Word('omw-es-sonar-v')] .. note:: It is not possible to disable normalization for the convenience functions :func:`wn.words`, :func:`wn.senses`, and :func:`wn.synsets`. wn-1.0.0/docs/guides/lexicons.rst000066400000000000000000000226631513755206300167420ustar00rootroot00000000000000Working with Lexicons ===================== Terminology ----------- In Wn, the following terminology is used: :lexicon: An inventory of words, senses, synsets, relations, etc. that share a namespace (i.e., that can refer to each other). :wordnet: A group of lexicons (but usually just one). :resource: A file containing lexicons. :package: A directory containing a resource and optionally some metadata files. :collection: A directory containing packages and optionally some metadata files. :project: A general term for a resource, package, or collection, particularly pertaining to its creation, maintenance, and distribution. In general, each resource contains one lexicon. For large projects like the `Open English WordNet`_, that lexicon is also a wordnet on its own. For a collection like the `Open Multilingual Wordnet`_, most lexicons do not include relations as they are instead expected to use those from the OMW's included English wordnet, which is derived from the `Princeton WordNet`_. As such, a wordnet for these sub-projects is best thought of as the grouping of the lexicon with the lexicon providing the relations. .. _Open English WordNet: https://en-word.net .. _Open Multilingual Wordnet: https://github.com/omwn/ .. _Princeton WordNet: https://wordnet.princeton.edu/ .. 
_lexicon-specifiers: Lexicon and Project Specifiers ------------------------------ Wn uses *lexicon specifiers* to deal with the possibility of having multiple lexicons and multiple versions of lexicons loaded in the same database. The specifiers are the joining of a lexicon's name (ID) and version, delimited by ``:``. Here are the possible forms: .. code-block:: none * -- any/all lexicons id -- the most recently added lexicon with the given id id:* -- all lexicons with the given id id:version -- the lexicon with the given id and version *:version -- all lexicons with the given version For example, if ``ewn:2020`` was installed followed by ``ewn:2019``, then ``ewn`` would specify the ``2019`` version, ``ewn:*`` would specify both versions, and ``ewn:2020`` would specify the ``2020`` version. The same format is used for *project specifiers*, which refer to projects as defined in Wn's index. In most cases the project specifier is the same as the lexicon specifier (e.g., ``ewn:2020`` refers both to the project to be downloaded and the lexicon that is installed), but sometimes it is not. The 1.4 release of the `Open Multilingual Wordnet`_, for instance, has the project specifier ``omw:1.4`` but it installs a number of lexicons with their own lexicon specifiers (``omw-zsm:1.4``, ``omw-cmn:1.4``, etc.). When only an id is given (e.g., ``ewn``), a project specifier gets the *first* version listed in the index (in the default index, conventionally, the first version is the latest release). .. 
_lexicon-filters: Filtering Queries with Lexicons ------------------------------- Queries against the database will search all installed lexicons unless they are filtered by ``lang`` or ``lexicon`` arguments: >>> import wn >>> len(wn.words()) 1538449 >>> len(wn.words(lang="en")) 318289 >>> len(wn.words(lexicon="oewn:2024")) 161705 The ``lexicon`` parameter can also take multiple specifiers so you can include things like lexicon extensions or to explicitly include multiple lexicons: >>> len(wn.words(lexicon="oewn:2024 omw-en:1.4")) 318289 If a lexicon selected by the ``lexicon`` or ``lang`` arguments specifies a dependency, the dependency is automatically added as an *expand* lexicon. Explicitly set :python:`expand=''` to disable this behavior: >>> wn.lexicons(lexicon="omw-es:1.4")[0].requires() # omw-es requires omw-en {'omw-en:1.4': } >>> es = wn.Wordnet("omw-es:1.4") >>> es.lexicons() [] >>> es.expanded_lexicons() # omw-en automatically added [] >>> es_no_en = wn.Wordnet("omw-es:1.4", expand='') >>> es_no_en.lexicons() [] >>> es_no_en.expanded_lexicons() # no expand lexicons [] Also see :ref:`cross-lingual-relation-traversal` for selecting expand lexicons for relations. The objects returned by queries retain the "lexicon configuration" used, which includes the lexicons and expand lexicons. This configuration determines which lexicons are searched during secondary queries. The lexicon configuration also stores a flag indicating whether no lexicon filters were used at all, which triggers :ref:`default mode ` secondary queries. .. _default-mode: Default Mode Queries -------------------- A special "default mode" is activated when making a module-function query (:func:`wn.words`, :func:`wn.synsets`, etc.) 
or instantiating a :class:`wn.Wordnet` object with no ``lexicon`` or ``lang`` argument (so-named because the mode is triggered by using the default values of ``lexicon`` and ``lang``): >>> w = wn.Wordnet() >>> wn.words("pineapple") # for example Default-mode causes the following behavior: 1. Primary queries search any installed lexicon 2. Secondary queries only search the lexicon of the primary entity (e.g., :meth:`Synset.words` only finds words from the same lexicon as the synset). If the lexicon has any extensions or is itself an extension, any extension/base lexicons are also included. 3. If the ``expand`` argument is :python:`None` (always true for module functions like :func:`wn.synsets`), all installed lexicons are used as expand lexicons for relations queries. .. warning:: Default-mode queries are not reproducible as the results can change as lexicons are added or removed from the database. For anything more than a casual query, it is highly suggested to instead create a :class:`wn.Wordnet` object with fully-specified ``lexicon`` and ``expand`` arguments. Downloading Lexicons -------------------- Use :py:func:`wn.download` to download lexicons from the web given either an indexed project specifier or the URL of a resource, package, or collection. >>> import wn >>> wn.download('odenet') # get the latest Open German WordNet >>> wn.download('odenet:1.3') # get the 1.3 version >>> # download from a URL >>> wn.download('https://github.com/omwn/omw-data/releases/download/v1.4/omw-1.4.tar.xz') The project specifier is only used to retrieve information from Wn's index. The lexicon IDs of the corresponding resource files are what is stored in the database. 
Adding Local Lexicons --------------------- Lexicons can be added from local files with :py:func:`wn.add`: >>> wn.add('~/data/omw-1.4/omw-nb/omw-nb.xml') Or with the parent directory as a package: >>> wn.add('~/data/omw-1.4/omw-nb/') Or with the grandparent directory as a collection (installing all packages contained by the collection): >>> wn.add('~/data/omw-1.4/') Or from a compressed archive of one of the above: >>> wn.add('~/data/omw-1.4/omw-nb/omw-nb.xml.xz') >>> wn.add('~/data/omw-1.4/omw-nb.tar.xz') >>> wn.add('~/data/omw-1.4.tar.xz') Listing Installed Lexicons -------------------------- If you wish to see which lexicons have been added to the database, :py:func:`wn.lexicons()` returns the list of :py:class:`wn.Lexicon` objects that describe each one. >>> for lex in wn.lexicons(): ... print(f'{lex.id}:{lex.version}\t{lex.label}') ... omw-en:1.4 OMW English Wordnet based on WordNet 3.0 omw-nb:1.4 Norwegian Wordnet (Bokmål) odenet:1.3 Offenes Deutsches WordNet ewn:2020 English WordNet ewn:2019 English WordNet Removing Lexicons ----------------- Lexicons can be removed from the database with :py:func:`wn.remove`: >>> wn.remove('omw-nb:1.4') Note that this removes a single lexicon and not a project, so if, for instance, you've installed a multi-lexicon project like ``omw``, you will need to remove each lexicon individually or use a star specifier: >>> wn.remove('omw-*:1.4') WN-LMF Files, Packages, and Collections --------------------------------------- Wn can handle projects with 3 levels of structure: * WN-LMF XML files * WN-LMF packages * WN-LMF collections WN-LMF XML Files '''''''''''''''' A WN-LMF XML file is a file with a ``.xml`` extension that is valid according to the `WN-LMF specification `_. WN-LMF Packages ''''''''''''''' If one needs to distribute metadata or additional files along with WN-LMF XML file, a WN-LMF package allows them to include the files in a directory. 
The directory should contain exactly one ``.xml`` file, which is the WN-LMF XML file. In addition, it may contain additional files and Wn will recognize three of them: :``LICENSE`` (``.txt`` | ``.md`` | ``.rst`` ): the full text of the license :``README`` (``.txt`` | ``.md`` | ``.rst`` ): the project README :``citation.bib``: a BibTeX file containing academic citations for the project .. code-block:: omw-sq/ ├── omw-sq.xml ├── LICENSE.txt └── README.md WN-LMF Collections '''''''''''''''''' In some cases a project may manage multiple resources and distribute them as a collection. A collection is a directory containing subdirectories which are WN-LMF packages. The collection may contain its own README, LICENSE, and citation files which describe the project as a whole. .. code-block:: omw-1.4/ ├── omw-sq │   ├── oms-sq.xml │   ├── LICENSE.txt │   └── README.md ├── omw-lt │   ├── citation.bib │   ├── LICENSE │   └── omw-lt.xml ├── ... ├── citation.bib ├── LICENSE └── README.md wn-1.0.0/docs/guides/nltk-migration.rst000066400000000000000000000104221513755206300200430ustar00rootroot00000000000000Migrating from the NLTK ======================= This guide is for users of the `NLTK `_\ 's ``nltk.corpus.wordnet`` module who are migrating to Wn. It is not guaranteed that Wn will produce the same results as the NLTK's module, but with some care its behavior can be very similar. Overview -------- One important thing to note is that Wn will search all wordnets in the database by default where the NLTK would only search the English. >>> from nltk.corpus import wordnet as nltk_wn >>> nltk_wn.synsets('chat') # only English >>> nltk_wn.synsets('chat', lang='fra') # only French >>> import wn >>> wn.synsets('chat') # all wordnets >>> wn.synsets('chat', lang='fr') # only French With Wn it helps to create a :class:`wn.Wordnet` object to pre-filter the results by language or lexicon. 
>>> en = wn.Wordnet('omw-en:1.4') >>> en.synsets('chat') # only the OMW English Wordnet Equivalent Operations --------------------- The following table lists equivalent API calls for the NLTK's wordnet module and Wn assuming the respective modules have been instantiated (in separate Python sessions) as follows: NLTK: >>> from nltk.corpus import wordnet as wn >>> ss = wn.synsets("chat", pos="v")[0] Wn: >>> import wn >>> en = wn.Wordnet('omw-en:1.4') >>> ss = en.synsets("chat", pos="v")[0] .. default-role:: python Primary Queries ''''''''''''''' ========================================= =============================================== NLTK Wn ========================================= =============================================== `wn.langs()` `[lex.language for lex in wn.lexicons()]` `wn.lemmas("chat")` -- -- `en.words("chat")` -- `en.senses("chat")` `wn.synsets("chat")` `en.synsets("chat")` `wn.synsets("chat", pos="v")` `en.synsets("chat", pos="v")` `wn.all_synsets()` `en.synsets()` `wn.all_synsets(pos="v")` `en.synsets(pos="v")` ========================================= =============================================== Synsets -- Basic '''''''''''''''' =================== ================= NLTK Wn =================== ================= `ss.lemmas()` -- -- `ss.senses()` -- `ss.words()` `ss.lemmas_names()` `ss.lemmas()` `ss.definition()` `ss.definition()` `ss.examples()` `ss.examples()` `ss.pos()` `ss.pos` =================== ================= Synsets -- Relations '''''''''''''''''''' ========================================== ===================================== NLTK Wn ========================================== ===================================== `ss.hypernyms()` `ss.get_related("hypernym")` `ss.instance_hypernyms()` `ss.get_related("instance_hypernym")` `ss.hypernyms() + ss.instance_hypernyms()` `ss.hypernyms()` `ss.hyponyms()` `ss.get_related("hyponym")` `ss.member_holonyms()` `ss.get_related("holo_member")` `ss.member_meronyms()` `ss.get_related("mero_member")` 
`ss.closure(lambda x: x.hypernyms())` `ss.closure("hypernym")` ========================================== ===================================== Synsets -- Taxonomic Structure '''''''''''''''''''''''''''''' ================================ ========================================================= NLTK Wn ================================ ========================================================= `ss.min_depth()` `ss.min_depth()` `ss.max_depth()` `ss.max_depth()` `ss.hypernym_paths()` `[list(reversed([ss] + p)) for p in ss.hypernym_paths()]` `ss.common_hypernyms(ss)` `ss.common_hypernyms(ss)` `ss.lowest_common_hypernyms(ss)` `ss.lowest_common_hypernyms(ss)` `ss.shortest_path_distance(ss)` `len(ss.shortest_path(ss))` ================================ ========================================================= .. reset default role .. default-role:: (these tables are incomplete) wn-1.0.0/docs/guides/wordnet.rst000066400000000000000000000134351513755206300165750ustar00rootroot00000000000000.. raw:: html The Structure of a Wordnet ========================== A **wordnet** is an online lexicon which is organized by concepts. The basic unit of a wordnet is the synonym set (**synset**), a group of words that all refer to the same concept. Words and synsets are linked by means of conceptual-semantic relations to form the structure of wordnet. Words, Senses, and Synsets -------------------------- We all know that **words** are the basic building blocks of languages, a word is built up with two parts, its form and its meaning, but in natural languages, the word form and word meaning are not in an elegant one-to-one match, one word form may connect to many different meanings, so hereforth, we need **senses**, to work as the unit of word meanings, for example, the word *bank* has at least two senses: 1. bank\ :sup:`1`\: financial institution, like *City Bank*; 2. 
bank\ :sup:`2`\: sloping land, like *river bank*; Since **synsets** are group of words sharing the same concept, bank\ :sup:`1`\ and bank\ :sup:`2`\ are members of two different synsets, although they have the same word form. On the other hand, different word forms may also convey the same concept, such as *cab* and *taxi*, these word forms with the same concept are grouped together into one synset. .. raw:: html :file: images/word-sense-synset.svg .. role:: center :class: center :center:`Figure: relations between words, senses and synsets` Synset Relations ---------------- In wordnet, synsets are linked with each other to form various kinds of relations. For example, if the concept expressed by a synset is more general than a given synset, then it is in a *hypernym* relation with the given synset. As shown in the figure below, the synset with *car*, *auto* and *automobile* as its member is the *hypernym* of the other synset with *cab*, *taxi* and *hack*. Such relation which is built on the synset level is categorized as synset relations. .. raw:: html :file: images/synset-synset.svg :center:`Figure: example of synset relations` Sense Relations --------------- Some relations in wordnet are also built on sense level, which can be further divided into two types, relations that link sense with another sense, and relations that link sense with another synset. .. note:: In wordnet, synset relation and sense relation can both employ a particular relation type, such as `domain topic `_. **Sense-Sense** Sense to sense relations emphasize the connections between different senses, especially when dealing with morphologically related words. For example, *behavioral* is the adjective to the noun *behavior*, which is known as in the *pertainym* relation with *behavior*, however, such relation doesn't exist between *behavioral* and *conduct*, which is a synonym of *behavior* and is in the same synset. Here *pertainym* is a sense-sense relation. .. 
raw:: html :file: images/sense-sense.svg :center:`Figure: example of sense-sense relations` **Sense-Synset** Sense-synset relations connect a particular sense with a synset. For example, *cursor* is a term in the *computer science* discipline, in wordnet, it is in the *has domain topic* relation with the *computer science* synset, but *pointer*, which is in the same synset with *cursor*, is not a term, thus has no such relation with *computer science* synset. .. raw:: html :file: images/sense-synset.svg :center:`Figure: example of sense-synset relations` Other Information ----------------- A wordnet should be built in an appropriate form, two schemas are accepted: * XML schema based on the Lexical Markup Framework (LMF) * JSON-LD using the Lexicon Model for Ontologies The structure of a wordnet should contain below info: **Definition** Definition is used to define senses and synsets in a wordnet, it is given in the language of the wordnet it came from. **Example** Example is used to clarify the senses and synsets in a wordnet, users can understand the definition more clearly with a given example. 
**Metadata** A wordnet has its own metadata, based on the `Dublin Core `_, to state the basic info of it, below table lists all the items in the metadata of a wordnet: +------------------+-----------+-----------+ | contributor | Optional | str | +------------------+-----------+-----------+ | coverage | Optional | str | +------------------+-----------+-----------+ | creator | Optional | str | +------------------+-----------+-----------+ | date | Optional | str | +------------------+-----------+-----------+ | description | Optional | str | +------------------+-----------+-----------+ | format | Optional | str | +------------------+-----------+-----------+ | identifier | Optional | str | +------------------+-----------+-----------+ | publisher | Optional | str | +------------------+-----------+-----------+ | relation | Optional | str | +------------------+-----------+-----------+ | rights | Optional | str | +------------------+-----------+-----------+ | source | Optional | str | +------------------+-----------+-----------+ | subject | Optional | str | +------------------+-----------+-----------+ | title | Optional | str | +------------------+-----------+-----------+ | type | Optional | str | +------------------+-----------+-----------+ | status | Optional | str | +------------------+-----------+-----------+ | note | Optional | str | +------------------+-----------+-----------+ | confidence | Optional | float | +------------------+-----------+-----------+wn-1.0.0/docs/index.rst000066400000000000000000000024751513755206300147440ustar00rootroot00000000000000 Wn Documentation ================ Overview -------- This package provides an interface to wordnet data, from simple lookup queries, to graph traversals, to more sophisticated algorithms and metrics. Features include: - Support for wordnets in the `WN-LMF `_ format - A `SQLite `_ database backend for data consistency and efficient queries - Accurate modeling of Words, Senses, and Synsets Quick Start ----------- .. 
code-block:: console $ pip install wn .. code-block:: python >>> import wn >>> wn.download('ewn:2020') >>> wn.synsets('coffee') [Synset('ewn-04979718-n'), Synset('ewn-07945591-n'), Synset('ewn-07945759-n'), Synset('ewn-12683533-n')] Contents -------- .. toctree:: :maxdepth: 2 setup.rst cli.rst faq.rst .. toctree:: :caption: Guides :maxdepth: 2 guides/lexicons.rst guides/basic.rst guides/interlingual.rst guides/wordnet.rst guides/lemmatization.rst guides/nltk-migration.rst .. toctree:: :caption: API Reference :maxdepth: 1 :hidden: api/wn.rst api/wn.compat.rst api/wn.constants.rst api/wn.ic.rst api/wn.ili.rst api/wn.lmf.rst api/wn.morphy.rst api/wn.project.rst api/wn.similarity.rst api/wn.taxonomy.rst api/wn.util.rst api/wn.validate.rst wn-1.0.0/docs/make.bat000066400000000000000000000014331513755206300145010ustar00rootroot00000000000000@ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=. set BUILDDIR=_build if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd wn-1.0.0/docs/requirements.txt000066400000000000000000000000751513755206300163610ustar00rootroot00000000000000sphinx ~= 8.1 furo == 2024.8.6 sphinx-copybutton == 0.5.2 . wn-1.0.0/docs/setup.rst000066400000000000000000000102221513755206300147620ustar00rootroot00000000000000Installation and Configuration ============================== .. seealso:: This guide is for installing and configuring the Wn software. 
For adding lexicons to the database, see :doc:`guides/lexicons`. Installing from PyPI -------------------- Install the latest release from `PyPI `_: .. code-block:: bash pip install wn The Data Directory ------------------ By default, Wn stores its data (such as downloaded LMF files and the database file) in a ``.wn_data/`` directory under the user's home directory. This directory can be changed (see `Configuration`_ below). Whenever Wn attempts to download a resource or access its database, it will check for the existence of, and create if necessary, this directory, the ``.wn_data/downloads/`` subdirectory, and the ``.wn_data/wn.db`` database file. The file system will look like this:: .wn_data/ ├── downloads │   ├── ... │   └── ... └── wn.db The ``...`` entries in the ``downloads/`` subdirectory represent the files of resources downloaded from the web. Their filename is a hash of the URL so that Wn can avoid downloading the same file twice. Configuration ------------- The :py:data:`wn.config` object contains the paths Wn uses for local storage and information about resources available on the web. To change the directory Wn uses for storing data locally, modify the :python:`wn.config.data_directory` member: .. code-block:: python import wn wn.config.data_directory = '~/Projects/wn_data' There are some things to note: - The downloads directory and database path are always relative to the data directory and cannot be changed directly. - This change only affects subsequent operations, so any data in the previous location will not be moved nor deleted. - This change only affects the current session. If you want a script or application to always use the new location, it must reset the data directory each time it is initialized. You can also add project information for remote resources. First you add a project, with a project ID, full name, and language code. Then you create one or more versions for that project with a version ID, resource URL, and license information. 
This may be done either through the :py:data:`wn.config` object's :py:meth:`~wn._config.WNConfig.add_project` and :py:meth:`~wn._config.WNConfig.add_project_version` methods, or loaded from a TOML_ file via the :py:data:`wn.config` object's :py:meth:`~wn._config.WNConfig.load_index` method. .. _TOML: https://toml.io .. code-block:: python wn.config.add_project('ewn', 'English WordNet', 'en') wn.config.add_project_version( 'ewn', '2020', 'https://en-word.net/static/english-wordnet-2020.xml.gz', 'https://creativecommons.org/licenses/by/4.0/', ) Rebuilding the Database ----------------------- New versions of Wn may occasionally alter the database schema in a way that makes an existing database incompatible with the code. You will see an error like this (abbreviated): >>> import wn >>> wn.Wordnet("oewn:2024") Traceback (most recent call last): [...] wn.DatabaseError: Wn's schema has changed and is no longer compatible with the database. Lexicons currently installed: odenet:1.4 oewn:2023 oewn:2024 omw-arb:1.4 [...]] Run wn.reset_database(rebuild=True) to rebuild the database. You can then run, as directed, :func:`wn.reset_database` with ``rebuild=True``, which will delete the database, initialize a new one, and attempt to add all the lexicons that were previously added. You can also run with ``rebuild=False`` to reinitialize the database without re-adding lexicons, or alternatively simply delete the database file from your filesystem. See the documentation for :func:`wn.reset_database` for more information. Installing From Source ---------------------- If you wish to install the code from the source repository (e.g., to get an unreleased feature or to contribute toward Wn's development), clone the repository and use `Hatch `_ to start a virtual environment with Wn installed: .. 
code-block:: console $ git clone https://github.com/goodmami/wn.git $ cd wn $ hatch shell wn-1.0.0/pyproject.toml000066400000000000000000000047741513755206300150730ustar00rootroot00000000000000[build-system] requires = ["hatchling"] build-backend = "hatchling.build" [project] dynamic = ['version'] name = "wn" description = "Wordnet interface library" readme = "README.md" requires-python = ">=3.10" license = {file = "LICENSE"} keywords = ["wordnet", "interlingual", "linguistics", "language", "library"] authors = [ {name = "Michael Wayne Goodman", email = "goodman.m.w@gmail.com"} ] classifiers = [ "Development Status :: 4 - Beta", "Environment :: Console", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.14", "Topic :: Scientific/Engineering :: Information Analysis", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Text Processing :: Linguistic", ] dependencies = [ "httpx", "tomli; python_version < '3.11'", ] [project.optional-dependencies] editor = [ "wn-editor" ] [project.urls] homepage = "https://github.com/goodmami/wn" documentation = "https://wn.readthedocs.io" changelog = "https://github.com/goodmami/wn/blob/main/CHANGELOG.md" [tool.hatch.version] path = "wn/__init__.py" [tool.hatch.build.targets.sdist] exclude = [ "/.github", ] [tool.hatch.envs.hatch-test] extra-dependencies = [ "pytest-benchmark", ] [tool.hatch.envs.mypy] dependencies = [ "mypy", ] [tool.hatch.envs.mypy.scripts] check = "mypy wn/" [tool.hatch.envs.types] dependencies = [ "wn[dev]", "ty", ] [tool.hatch.envs.types.scripts] check = "ty check {args:wn/}" [tool.hatch.envs.docs] dependencies = [ "wn", "furo", "sphinx", 
"sphinx-copybutton", "sphinx-autobuild", ] [tool.hatch.envs.docs.scripts] build = "sphinx-build -M html docs docs/_build" clean = "sphinx-build -M clean docs docs/_build" watch = "sphinx-autobuild docs docs/_build/html" [tool.ruff] target-version = "py310" line-length = 88 [tool.ruff.lint] select = [ "B", # flake8-bugbear "C4", # comprehensions "C90", # McCabe cyclomatic complexity "E", # pycodestyle "F", # Pyflakes "I", # isort "LOG", # logging "PT", # pytest style "RUF", # ruff-specific fixes "SIM", # simplifications "TC", # type checking "UP", # newer python features "W", # pycodestyle ] wn-1.0.0/tests/000077500000000000000000000000001513755206300133055ustar00rootroot00000000000000wn-1.0.0/tests/_util_test.py000066400000000000000000000021071513755206300160320ustar00rootroot00000000000000from wn._util import flatten, normalize_form, unique_list def test_flatten(): assert flatten([]) == [] assert flatten([[]]) == [] assert flatten([[], []]) == [] assert flatten([[[], []], [[], []]]) == [[], [], [], []] assert flatten([[1]]) == [1] assert flatten([[1, 2], [3, 4]]) == [1, 2, 3, 4] assert flatten(["AB", "CD"]) == ["A", "B", "C", "D"] def test_unique_list(): assert unique_list([]) == [] assert unique_list([1]) == [1] assert unique_list([1, 1, 1, 1, 1]) == [1] assert unique_list([1, 1, 2, 2, 1]) == [1, 2] assert unique_list([2, 1, 2, 2, 1]) == [2, 1] assert unique_list("A") == ["A"] assert unique_list("AAA") == ["A"] assert unique_list("ABABA") == ["A", "B"] assert unique_list([(1, 2), (1, 2), (2, 3)]) == [(1, 2), (2, 3)] def test_normalize_form(): assert normalize_form("ABC") == "abc" assert normalize_form("so\xf1ar") == "sonar" # soñar with single ñ character assert normalize_form("son\u0303ar") == "sonar" # soñar with combining tilde assert normalize_form("Weiß") == "weiss" wn-1.0.0/tests/compat_sensekey_test.py000066400000000000000000000142301513755206300201070ustar00rootroot00000000000000import pytest import wn from wn.compat import sensekey def 
test_unescape_oewn_sense_key(): def unescape(s: str) -> str: return sensekey.unescape(s, flavor="oewn") assert unescape("") == "" assert unescape("abc") == "abc" assert unescape(".") == "." # only becomes : in second part of key # escape patterns assert unescape("-ap-") == "'" assert unescape("-ex-") == "!" assert unescape("-cm-") == "," assert unescape("-cn-") == ":" assert unescape("-pl-") == "+" assert unescape("-sl-") == "/" # adjacent escapes need their own dashes assert unescape("-ap-ex-") == "'ex-" assert unescape("-ap--ex-") == "'!" # invalid escapes are unchanged assert unescape("-foo-") == "-foo-" # not an escape sequence assert unescape("-sp-") == "-sp-" # not valid in lemma portion assert unescape("ap-") == "ap-" # no preceding dash assert unescape("-ap") == "-ap" # no trailing dash assert unescape("-AP-") == "-AP-" # case sensitivity # full key, second part escapes differently assert unescape("abc__1.23.00..") == "abc%1:23:00::" assert unescape("abc__1.23.00.foo-sp-bar.") == "abc%1:23:00:foo_bar:" assert unescape("abc__1.23.00.foo-ap-bar.") == "abc%1:23:00:foo-ap-bar:" def test_escape_oewn_sense_key(): def escape(s: str) -> str: return sensekey.escape(s, flavor="oewn") assert escape("") == "" assert escape("abc") == "abc" assert escape(".") == "." # only becomes : in second part of key # escape patterns assert escape("'") == "-ap-" assert escape("!") == "-ex-" assert escape(",") == "-cm-" assert escape(":") == "-cn-" assert escape("+") == "-pl-" assert escape("/") == "-sl-" # adjacent escapes need their own dashes assert escape("'!") == "-ap--ex-" # full key, second part escapes differently assert escape("abc%1:23:00::") == "abc__1.23.00.." assert escape("abc%1:23:00:foo_bar:") == "abc__1.23.00.foo-sp-bar." assert escape("abc%1:23:00:foo'bar:") == "abc__1.23.00.foo'bar." 
def test_unescape_oewn_v2_sense_key(): def unescape(s: str) -> str: return sensekey.unescape(s, flavor="oewn-v2") assert unescape("") == "" assert unescape("abc") == "abc" assert unescape(".") == "." # only becomes : in second part of key # escape patterns assert unescape("-apos-") == "'" assert unescape("-excl-") == "!" assert unescape("-comma-") == "," assert unescape("-colon-") == ":" assert unescape("-plus-") == "+" assert unescape("-sol-") == "/" assert unescape("--") == "-" # adjacent escapes need their own dashes assert unescape("-apos-excl-") == "'excl-" assert unescape("-apos--excl-") == "'!" # invalid escapes are unchanged assert unescape("-foo-") == "-foo-" # not an escape sequence assert unescape("-sp-") == "-sp-" # not valid in lemma portion assert unescape("ap-") == "ap-" # no preceding dash assert unescape("-ap") == "-ap" # no trailing dash assert unescape("-AP-") == "-AP-" # case sensitivity # full key, second part escapes differently assert unescape("abc__1.23.00..") == "abc%1:23:00::" assert unescape("abc__1.23.00.foo-sp-bar.") == "abc%1:23:00:foo_bar:" assert unescape("abc__1.23.00.foo-ap-bar.") == "abc%1:23:00:foo-ap-bar:" def test_escape_oewn_v2_sense_key(): def escape(s: str) -> str: return sensekey.escape(s, flavor="oewn-v2") assert escape("") == "" assert escape("abc") == "abc" assert escape(".") == "." # only becomes : in second part of key # escape patterns assert escape("'") == "-apos-" assert escape("!") == "-excl-" assert escape(",") == "-comma-" assert escape(":") == "-colon-" assert escape("+") == "-plus-" assert escape("/") == "-sol-" assert escape("-") == "--" # adjacent escapes need their own dashes assert escape("'!") == "-apos--excl-" # full key, second part escapes differently assert escape("abc%1:23:00::") == "abc__1.23.00.." assert escape("abc%1:23:00:foo_bar:") == "abc__1.23.00.foo-sp-bar." assert escape("abc%1:23:00:foo'bar:") == "abc__1.23.00.foo'bar." 
@pytest.mark.usefixtures("uninitialized_datadir") def test_sense_key_getter(datadir): wn.add(datadir / "sense-key-variations.xml") wn.add(datadir / "sense-key-variations2.xml") get_omw_sense_key = sensekey.sense_key_getter("omw-en:1.4") get_oewn2024_sense_key = sensekey.sense_key_getter("oewn:2024") get_oewn2025_sense_key = sensekey.sense_key_getter("oewn:2025") omw_sense = wn.sense("omw-en--apos-s_Gravenhage-08950407-n", lexicon="omw-en:1.4") oewn2024_sense = wn.sense("oewn--ap-s_gravenhage__1.15.00..", lexicon="oewn:2024") oewn2025_sense = wn.sense("oewn--apos-s_gravenhage__1.15.00..", lexicon="oewn:2025") assert get_omw_sense_key(omw_sense) == "'s_gravenhage%1:15:00::" assert get_omw_sense_key(oewn2024_sense) is None assert get_omw_sense_key(oewn2025_sense) is None assert get_oewn2024_sense_key(omw_sense) is None assert get_oewn2024_sense_key(oewn2024_sense) == "'s_gravenhage%1:15:00::" assert get_oewn2024_sense_key(oewn2025_sense) == "-apos-s_gravenhage%1:15:00::" assert get_oewn2025_sense_key(omw_sense) is None assert get_oewn2025_sense_key(oewn2024_sense) == "-ap-s_gravenhage%1:15:00::" assert get_oewn2025_sense_key(oewn2025_sense) == "'s_gravenhage%1:15:00::" @pytest.mark.usefixtures("uninitialized_datadir") def test_sense_getter(datadir): wn.add(datadir / "sense-key-variations.xml") wn.add(datadir / "sense-key-variations2.xml") get_omw_sense = sensekey.sense_getter("omw-en:1.4") get_oewn2024_sense = sensekey.sense_getter("oewn:2024") get_oewn2025_sense = sensekey.sense_getter("oewn:2025") omw_sense = wn.sense("omw-en--apos-s_Gravenhage-08950407-n", lexicon="omw-en:1.4") oewn2024_sense = wn.sense("oewn--ap-s_gravenhage__1.15.00..", lexicon="oewn:2024") oewn2025_sense = wn.sense("oewn--apos-s_gravenhage__1.15.00..", lexicon="oewn:2025") assert get_omw_sense("'s_gravenhage%1:15:00::") == omw_sense assert get_oewn2024_sense("'s_gravenhage%1:15:00::") == oewn2024_sense assert get_oewn2025_sense("'s_gravenhage%1:15:00::") == oewn2025_sense 
wn-1.0.0/tests/conftest.py000066400000000000000000000051751513755206300155140ustar00rootroot00000000000000import lzma from pathlib import Path import pytest import wn @pytest.fixture(scope="session") def datadir(): return Path(__file__).parent / "data" @pytest.fixture def uninitialized_datadir(monkeypatch, tmp_path: Path): with monkeypatch.context() as m: m.setattr(wn.config, "data_directory", tmp_path / "uninitialized_datadir") yield @pytest.fixture(scope="session") def empty_db(tmp_path_factory): dir = tmp_path_factory.mktemp("wn_data_empty") with pytest.MonkeyPatch.context() as m: m.setattr(wn.config, "data_directory", dir) yield # We want to build these DBs once per session, but connections # are created once for every test. @pytest.fixture(scope="session") def mini_db_dir(datadir, tmp_path_factory): dir = tmp_path_factory.mktemp("wn_data_mini") with pytest.MonkeyPatch.context() as m: m.setattr(wn.config, "data_directory", dir) wn.add(datadir / "mini-lmf-1.0.xml") wn.add(datadir / "mini-ili.tsv") wn._db.clear_connections() return Path(dir) @pytest.fixture def mini_lmf_compressed(datadir, tmp_path): data = (datadir / "mini-lmf-1.0.xml").read_bytes() path = tmp_path / "temp.xml.xz" with lzma.open(path, "w") as f: f.write(data) return Path(path) @pytest.fixture(scope="session") def mini_db_1_1_dir(datadir, tmp_path_factory): dir = tmp_path_factory.mktemp("wn_data_mini_1_1") with pytest.MonkeyPatch.context() as m: m.setattr(wn.config, "data_directory", dir) wn.add(datadir / "mini-lmf-1.0.xml") wn.add(datadir / "mini-lmf-1.1.xml") wn._db.clear_connections() return Path(dir) @pytest.fixture(scope="session") def mini_db_1_4_dir(datadir, tmp_path_factory): dir = tmp_path_factory.mktemp("wn_data_mini_1_4") with pytest.MonkeyPatch.context() as m: m.setattr(wn.config, "data_directory", dir) wn.add(datadir / "mini-lmf-1.0.xml") wn.add(datadir / "mini-lmf-1.4.xml") wn._db.clear_connections() return Path(dir) @pytest.fixture def mini_db(monkeypatch, mini_db_dir): with 
monkeypatch.context() as m: m.setattr(wn.config, "data_directory", mini_db_dir) yield wn._db.clear_connections() @pytest.fixture def mini_db_1_1(monkeypatch, mini_db_1_1_dir): with monkeypatch.context() as m: m.setattr(wn.config, "data_directory", mini_db_1_1_dir) yield wn._db.clear_connections() @pytest.fixture def mini_db_1_4(monkeypatch, mini_db_1_4_dir): with monkeypatch.context() as m: m.setattr(wn.config, "data_directory", mini_db_1_4_dir) yield wn._db.clear_connections() wn-1.0.0/tests/data/000077500000000000000000000000001513755206300142165ustar00rootroot00000000000000wn-1.0.0/tests/data/E101-0.xml000066400000000000000000000015751513755206300155130ustar00rootroot00000000000000 wn-1.0.0/tests/data/E101-1.xml000066400000000000000000000015031513755206300155030ustar00rootroot00000000000000 wn-1.0.0/tests/data/E101-2.xml000066400000000000000000000014131513755206300155040ustar00rootroot00000000000000 wn-1.0.0/tests/data/E101-3.xml000066400000000000000000000013311513755206300155040ustar00rootroot00000000000000 wn-1.0.0/tests/data/README.md000066400000000000000000000001421513755206300154720ustar00rootroot00000000000000# Testing Data Directory This directory is used to store data files used by the testing system. 
wn-1.0.0/tests/data/W305-0.xml000066400000000000000000000014111513755206300155300ustar00rootroot00000000000000 wn-1.0.0/tests/data/W306-0.xml000066400000000000000000000014001513755206300155270ustar00rootroot00000000000000 wn-1.0.0/tests/data/W307-0.xml000066400000000000000000000016611513755206300155410ustar00rootroot00000000000000 foo foo wn-1.0.0/tests/data/mini-ili-with-status.tsv000066400000000000000000000002111513755206300207470ustar00rootroot00000000000000ILI Definition Status i1 i1 definition active i2 deprecated i67447 knowledge acquired through study or experience or instruction active wn-1.0.0/tests/data/mini-ili.tsv000066400000000000000000000001501513755206300164570ustar00rootroot00000000000000ILI Definition i1 i1 definition i2 i67447 knowledge acquired through study or experience or instruction wn-1.0.0/tests/data/mini-lmf-1.0.xml000066400000000000000000000220011513755206300167370ustar00rootroot00000000000000 tag-text 3 something that informs "this is information" something that exemplifies "this is an example" providing an example a subset of exemplars from some population a sample randomly drawn from some population a sample that is random a measured or recorded piece of information to fire someone while making it look like it was their idea algo que informa "este es la información" algo que ejemplifica "este es el ejemplo" dar un ejemplo una muestra extraída aleatoriamente de alguna población wn-1.0.0/tests/data/mini-lmf-1.1.xml000066400000000000000000000135471513755206300167570ustar00rootroot00000000000000 tatoe ˌɪnfəˈmeɪʃən ˌɪnfɚˈmeɪʃən INF "the artist illustrated the story beautifully" depict something in a visual medium terminate employment wn-1.0.0/tests/data/mini-lmf-1.3.xml000066400000000000000000000031201513755206300167430ustar00rootroot00000000000000 one two three one two three one two three wn-1.0.0/tests/data/mini-lmf-1.4.xml000066400000000000000000000061771513755206300167630ustar00rootroot00000000000000 
wn-1.0.0/tests/data/sense-key-variations.xml000066400000000000000000000027551513755206300210310ustar00rootroot00000000000000 wn-1.0.0/tests/data/sense-key-variations2.xml000066400000000000000000000014211513755206300211000ustar00rootroot00000000000000 wn-1.0.0/tests/data/sense-member-order.xml000066400000000000000000000022241513755206300204330ustar00rootroot00000000000000 wn-1.0.0/tests/data/test-package/000077500000000000000000000000001513755206300165665ustar00rootroot00000000000000wn-1.0.0/tests/data/test-package/LICENSE000066400000000000000000000000151513755206300175670ustar00rootroot00000000000000Test License wn-1.0.0/tests/data/test-package/README.md000066400000000000000000000000161513755206300200420ustar00rootroot00000000000000# Test README wn-1.0.0/tests/data/test-package/citation.bib000066400000000000000000000000131513755206300210500ustar00rootroot00000000000000% test bib wn-1.0.0/tests/data/test-package/test-wn.xml000066400000000000000000000002501513755206300207060ustar00rootroot00000000000000 wn-1.0.0/tests/db_test.py000066400000000000000000000051051513755206300153040ustar00rootroot00000000000000import sqlite3 import threading import pytest import wn from wn import lmf @pytest.mark.usefixtures("mini_db") def test_schema_compatibility(): conn = sqlite3.connect(str(wn.config.database_path)) schema_hash = wn._db.schema_hash(conn) assert schema_hash in wn._db.COMPATIBLE_SCHEMA_HASHES @pytest.mark.usefixtures("mini_db") def test_db_multithreading(): """ See https://github.com/goodmami/wn/issues/86 Thanks: @fushinari """ class WNThread: w = None def __init__(self): w_thread = threading.Thread(target=self.set_w) w_thread.start() w_thread.join() self.w.synsets() def set_w(self): if self.w is None: self.w = wn.Wordnet() # close the connections by resetting the pool wn._db.pool = {} with pytest.raises(sqlite3.ProgrammingError): WNThread() wn._db.pool = {} wn.config.allow_multithreading = True WNThread() # no error wn.config.allow_multithreading = False wn._db.pool = 
{} def test_remove_extension(datadir, tmp_path): old_data_dir = wn.config.data_directory wn.config.data_directory = tmp_path / "wn_data_1_1_trigger" wn.add(datadir / "mini-lmf-1.0.xml") wn.add(datadir / "mini-lmf-1.1.xml") assert len(wn.lexicons()) == 4 wn.remove("test-en-ext") assert len(wn.lexicons()) == 3 wn.remove("test-ja") assert len(wn.lexicons()) == 2 wn.add(datadir / "mini-lmf-1.1.xml") assert len(wn.lexicons()) == 4 wn.remove("test-en") assert {lex.id for lex in wn.lexicons()} == {"test-es", "test-ja"} wn.config.data_directory = old_data_dir # close any open DB connections before teardown for conn in wn._db.pool.values(): conn.close() def test_add_lexical_resource(datadir, tmp_path): old_data_dir = wn.config.data_directory wn.config.data_directory = tmp_path / "wn_data_add_lexical_resource" wn.add_lexical_resource(lmf.load(datadir / "mini-lmf-1.0.xml")) assert len(wn.lexicons()) == 2 wn.add_lexical_resource(lmf.load(datadir / "mini-lmf-1.1.xml")) assert len(wn.lexicons()) == 4 wn.config.data_directory = old_data_dir # close any open DB connections before teardown for conn in wn._db.pool.values(): conn.close() @pytest.mark.usefixtures("empty_db") def test_reset_database(datadir): wn.add(datadir / "mini-lmf-1.0.xml") assert {lex.specifier() for lex in wn.lexicons()} == {"test-en:1", "test-es:1"} wn.reset_database(rebuild=False) # cannot rebuild from unindexed local files assert wn.lexicons() == [] wn-1.0.0/tests/export_test.py000066400000000000000000000040521513755206300162400ustar00rootroot00000000000000from xml.etree import ElementTree as ET import pytest import wn @pytest.mark.usefixtures("mini_db") def test_export(datadir, tmp_path): tmpdir = tmp_path / "test_export" tmpdir.mkdir() tmppath = tmpdir / "mini_lmf_export.xml" lexicons = wn.lexicons(lexicon="test-en test-es") wn.export(lexicons, tmppath, version="1.0") # remove comments, indentation, etc. 
orig = ET.canonicalize(from_file=datadir / "mini-lmf-1.0.xml", strip_text=True) temp = ET.canonicalize(from_file=tmppath, strip_text=True) # additional transformation to help with debugging orig = orig.replace("<", "\n<") temp = temp.replace("<", "\n<") assert orig == temp @pytest.mark.usefixtures("mini_db_1_1") def test_export_1_1(datadir, tmp_path): tmpdir = tmp_path / "test_export_1_1" tmpdir.mkdir() tmppath = tmpdir / "mini_lmf_export_1_1.xml" lexicons = wn.lexicons(lexicon="test-ja test-en-ext") wn.export(lexicons, tmppath, version="1.1") # remove comments, indentation, etc. orig = ET.canonicalize(from_file=datadir / "mini-lmf-1.1.xml", strip_text=True) temp = ET.canonicalize(from_file=tmppath, strip_text=True) # additional transformation to help with debugging orig = orig.replace("<", "\n<") temp = temp.replace("<", "\n<") assert orig == temp # fails when exporting to WN-LMF 1.0 with pytest.raises(wn.Error): wn.export(lexicons, tmppath, version="1.0") @pytest.mark.usefixtures("mini_db_1_4") def test_export_1_4(datadir, tmp_path): tmpdir = tmp_path / "test_export_1_4" tmpdir.mkdir() tmppath = tmpdir / "mini_lmf_export_1_4.xml" lexicons = wn.lexicons(lexicon="test-1.4 test-ext-1.4") wn.export(lexicons, tmppath, version="1.4") # remove comments, indentation, etc. 
orig = ET.canonicalize(from_file=datadir / "mini-lmf-1.4.xml", strip_text=True) temp = ET.canonicalize(from_file=tmppath, strip_text=True) # additional transformation to help with debugging orig = orig.replace("<", "\n<") temp = temp.replace("<", "\n<") assert orig == temp wn-1.0.0/tests/ic_test.py000066400000000000000000000073751513755206300153250ustar00rootroot00000000000000from math import log import pytest import wn import wn.ic from wn.constants import ADJ, ADV, NOUN, VERB from wn.util import synset_id_formatter synset_id = { "information": "test-en-0001-n", "illustration_example": "test-en-0002-n", "sample": "test-en-0004-n", "random_sample": "test-en-0005-n", "random_sample2": "test-en-0008-n", # no hypernyms "datum": "test-en-0006-n", "illustrate_exemplify": "test-en-0003-v", "resignate": "test-en-0007-v", } words = [ "For", "example", ":", "random sample", ".", "This", "will", "illustrate", "and", "exemplify", ".", "A", "sample", "of", "data", ".", ] @pytest.mark.usefixtures("mini_db") def test_compute_nodistribute_nosmoothing(): w = wn.Wordnet("test-en:1") assert wn.ic.compute(words, w, distribute_weight=False, smoothing=0) == { NOUN: { synset_id["information"]: 4.0, synset_id["illustration_example"]: 3.0, synset_id["sample"]: 2.0, synset_id["random_sample"]: 1.0, synset_id["random_sample2"]: 1.0, synset_id["datum"]: 1.0, None: 5.0, }, VERB: { synset_id["illustrate_exemplify"]: 2.0, synset_id["resignate"]: 0.0, None: 2.0, }, ADJ: {None: 0.0}, ADV: {None: 0.0}, } @pytest.mark.usefixtures("mini_db") def test_compute_nodistribute_smoothing(): w = wn.Wordnet("test-en:1") assert wn.ic.compute(words, w, distribute_weight=False, smoothing=1.0) == { NOUN: { synset_id["information"]: 5.0, synset_id["illustration_example"]: 4.0, synset_id["sample"]: 3.0, synset_id["random_sample"]: 2.0, synset_id["random_sample2"]: 2.0, synset_id["datum"]: 2.0, None: 6.0, }, VERB: { synset_id["illustrate_exemplify"]: 3.0, synset_id["resignate"]: 1.0, None: 3.0, }, ADJ: {None: 1.0}, 
ADV: {None: 1.0}, } @pytest.mark.usefixtures("mini_db") def test_compute_distribute_smoothing(): w = wn.Wordnet("test-en:1") assert wn.ic.compute(words, w, distribute_weight=True, smoothing=1.0) == { NOUN: { synset_id["information"]: 4.5, synset_id["illustration_example"]: 3.5, synset_id["sample"]: 2.5, synset_id["random_sample"]: 1.5, synset_id["random_sample2"]: 1.5, synset_id["datum"]: 2.0, None: 5.0, }, VERB: { synset_id["illustrate_exemplify"]: 3.0, synset_id["resignate"]: 1.0, None: 3.0, }, ADJ: {None: 1.0}, ADV: {None: 1.0}, } @pytest.mark.usefixtures("mini_db") def test_load(tmp_path): w = wn.Wordnet("test-en:1") icpath = tmp_path / "foo.dat" icpath.write_text( "wnver:1234567890AbCdEf\n" "1n 4.0 ROOT\n" "2n 3.0\n" "4n 2.0\n" "5n 1.0\n" "8n 1.0 ROOT\n" "6n 1.0\n" "3v 2.0 ROOT\n" "7v 0.0 ROOT\n" ) get_synset_id = synset_id_formatter("test-en-{offset:04}-{pos}") assert wn.ic.load(icpath, w, get_synset_id=get_synset_id) == wn.ic.compute( words, w, distribute_weight=False, smoothing=0.0 ) @pytest.mark.usefixtures("mini_db") def test_information_content(): w = wn.Wordnet("test-en:1") ic = wn.ic.compute(words, w) info = w.synsets("information")[0] samp = w.synsets("sample")[0] # info is a root but not the only one, so its IC is not 0.0 assert wn.ic.information_content(info, ic) == -log(ic["n"][info.id] / ic["n"][None]) assert wn.ic.information_content(samp, ic) == -log(ic["n"][samp.id] / ic["n"][None]) wn-1.0.0/tests/ili_test.py000066400000000000000000000052031513755206300154730ustar00rootroot00000000000000from pathlib import Path import pytest import wn from wn import ili I67447_DEFN = "knowledge acquired through study or experience or instruction" def test_is_ili_tsv(datadir: Path) -> None: assert ili.is_ili_tsv(datadir / "mini-ili.tsv") assert ili.is_ili_tsv(datadir / "mini-ili-with-status.tsv") assert not ili.is_ili_tsv(datadir / "mini-lmf-1.0.xml") assert not ili.is_ili_tsv(datadir / "does-not-exist") def test_load_tsv(datadir: Path) -> None: assert 
list(ili.load_tsv(datadir / "mini-ili.tsv")) == [ {"ili": "i1", "definition": "i1 definition"}, {"ili": "i2", "definition": ""}, {"ili": "i67447", "definition": I67447_DEFN}, ] assert list(ili.load_tsv(datadir / "mini-ili-with-status.tsv")) == [ {"ili": "i1", "definition": "i1 definition", "status": "active"}, {"ili": "i2", "definition": "", "status": "deprecated"}, {"ili": "i67447", "definition": I67447_DEFN, "status": "active"}, ] @pytest.mark.usefixtures("mini_db") def test_get() -> None: # present in ili file, not in lexicon i = ili.get("i1") assert i.id == "i1" assert i.status == ili.ILIStatus.ACTIVE assert i.definition() == "i1 definition" defn = i.definition(data=True) assert defn.text == "i1 definition" assert defn.metadata() == {} assert defn.confidence() == 1.0 # present in lexicon, not in ili file i = ili.get("i67469") assert i.id == "i67469" assert i.status == ili.ILIStatus.PRESUPPOSED assert i.definition() is None assert i.definition(data=True) is None # present in ili file and lexicon i = ili.get("i67447") assert i.id == "i67447" assert i.status == ili.ILIStatus.ACTIVE assert i.definition() == I67447_DEFN defn = i.definition(data=True) assert defn.text == I67447_DEFN assert defn.metadata() == {} assert defn.confidence() == 1.0 @pytest.mark.usefixtures("mini_db") def test_get_proposed() -> None: proposed_defn = "to fire someone while making it look like it was their idea" # synset with proposed ili ss = wn.synset("test-en-0007-v", lexicon="test-en") i = ili.get_proposed(ss) assert i is not None assert i.id is None assert i.synset() == ss assert i.status == ili.ILIStatus.PROPOSED assert i.lexicon() == ss.lexicon() assert i.definition() == proposed_defn defn = i.definition(data=True) assert defn.text == proposed_defn assert defn.metadata() == {"creator": "MM"} assert defn.confidence() == 0.9 # inherited from lexicon # synset without proposed ili ss = wn.synset("test-en-0006-n", lexicon="test-en") assert ili.get_proposed(ss) is None 
wn-1.0.0/tests/lmf_test.py000066400000000000000000000135521513755206300155020ustar00rootroot00000000000000from xml.etree import ElementTree as ET from wn import lmf def test_is_lmf(datadir): assert lmf.is_lmf(datadir / "mini-lmf-1.0.xml") assert lmf.is_lmf(str(datadir / "mini-lmf-1.0.xml")) assert not lmf.is_lmf(datadir / "README.md") assert not lmf.is_lmf(datadir / "missing.xml") assert lmf.is_lmf(datadir / "mini-lmf-1.1.xml") def test_scan_lexicons(datadir): assert lmf.scan_lexicons(datadir / "mini-lmf-1.0.xml") == [ { "id": "test-en", "version": "1", "label": "Testing English WordNet", "extends": None, }, { "id": "test-es", "version": "1", "label": "Testing Spanish WordNet", "extends": None, }, ] assert lmf.scan_lexicons(datadir / "mini-lmf-1.1.xml") == [ { "id": "test-ja", "version": "1", "label": "Testing Japanese WordNet", "extends": None, }, { "id": "test-en-ext", "version": "1", "label": "Testing English Extension", "extends": { "id": "test-en", "version": "1", }, }, ] def test_load_1_0(datadir): resource = lmf.load(datadir / "mini-lmf-1.0.xml") lexicons = resource["lexicons"] assert len(lexicons) == 2 lexicon = lexicons[0] assert lexicon["id"] == "test-en" assert lexicon["label"] == "Testing English WordNet" assert lexicon["language"] == "en" assert lexicon["email"] == "maintainer@example.com" assert lexicon["license"] == "https://creativecommons.org/licenses/by/4.0/" assert lexicon["version"] == "1" assert lexicon["url"] == "https://example.com/test-en" assert len(lexicon["entries"]) == 9 le = lexicon["entries"][0] assert le["id"] == "test-en-information-n" assert le["lemma"]["writtenForm"] == "information" assert le["lemma"]["partOfSpeech"] == "n" assert le["lemma"]["script"] == "Latn" assert len(le["lemma"]["tags"]) == 1 assert len(le.get("forms", [])) == 0 assert len(le["senses"]) == 1 sense = le["senses"][0] assert sense["id"] == "test-en-information-n-0001-01" assert sense["synset"] == "test-en-0001-n" assert len(sense.get("relations", [])) == 0 # 
assert sense["relations"][0]["target"] == "test-en-exemplify-v-01023137-01" # assert sense["relations"][0]["type"] == "derivation" assert len(lexicon.get("frames", [])) == 0 # frames are on lexical entry assert len(lexicon["entries"][6]["frames"]) == 2 frames = lexicon["entries"][6]["frames"] assert frames[0]["subcategorizationFrame"] == "Somebody ----s something" assert frames[0]["senses"] == ["test-en-illustrate-v-0003-01"] assert len(lexicon["synsets"]) == 8 assert lexicons[1]["id"] == "test-es" def test_load_1_1(datadir): resource = lmf.load(datadir / "mini-lmf-1.1.xml") lexicons = resource["lexicons"] assert len(lexicons) == 2 lexicon = lexicons[0] assert lexicon["id"] == "test-ja" assert lexicon["version"] == "1" # assert lexicon.logo == "logo.svg" assert lexicon.get("requires") == [{"id": "test-en", "version": "1"}] lexicon = lexicons[1] assert lexicon["id"] == "test-en-ext" assert lexicon.get("extends") == { "id": "test-en", "url": "https://example.com/test-en", "version": "1", } def test_load_1_3(datadir): resource = lmf.load(datadir / "mini-lmf-1.3.xml") lexicons = resource["lexicons"] assert len(lexicons) == 1 lexicon = lexicons[0] synsets = lexicon["synsets"] assert synsets[0]["definitions"][0]["text"] == "one two three" assert synsets[1]["definitions"][0]["text"] == "one two three" assert ( synsets[2]["definitions"][0]["text"] == """ one two three """ ) def test_load_1_4(datadir): resource = lmf.load(datadir / "mini-lmf-1.4.xml") lexicons = resource["lexicons"] assert len(lexicons) == 2 lexicon = lexicons[0] assert lexicon["entries"][0].get("index") == "foo_bar" assert lexicon["entries"][1].get("index") == "foo_bar" assert lexicon["entries"][2].get("index") is None assert lexicon["entries"][3].get("index") == "baz" assert lexicon["entries"][4].get("index") is None assert lexicon["entries"][5].get("index") == "baz" assert lexicon["entries"][0]["senses"][0].get("n") == 3 assert lexicon["entries"][1]["senses"][0].get("n") == 2 assert 
lexicon["entries"][1]["senses"][1].get("n") == 1 assert lexicon["entries"][2]["senses"][0].get("n") is None assert lexicon["entries"][3]["senses"][0].get("n") == 2 assert lexicon["entries"][4]["senses"][0].get("n") == 2 assert lexicon["entries"][4]["senses"][1].get("n") is None assert lexicon["entries"][5]["senses"][0].get("n") == 1 extension = lexicons[1] assert extension["id"] == "test-ext-1.4" assert extension.get("extends") == { "id": "test-en", "version": "1", "url": "https://example.com/test-en", } def test_dump(datadir, tmp_path): tmpdir = tmp_path / "test_dump" tmpdir.mkdir() tmppath = tmpdir / "mini_lmf_dump.xml" def assert_xml_equal(mini_lmf, dump_lmf): orig = ET.canonicalize(from_file=mini_lmf, strip_text=True) temp = ET.canonicalize(from_file=dump_lmf, strip_text=True) # additional transformation to help with debugging orig = orig.replace("<", "\n<") temp = temp.replace("<", "\n<") assert orig == temp lmf.dump(lmf.load(datadir / "mini-lmf-1.0.xml"), tmppath) assert_xml_equal(datadir / "mini-lmf-1.0.xml", tmppath) lmf.dump(lmf.load(datadir / "mini-lmf-1.1.xml"), tmppath) assert_xml_equal(datadir / "mini-lmf-1.1.xml", tmppath) lmf.dump(lmf.load(datadir / "mini-lmf-1.4.xml"), tmppath) assert_xml_equal(datadir / "mini-lmf-1.4.xml", tmppath) wn-1.0.0/tests/morphy_test.py000066400000000000000000000040441513755206300162360ustar00rootroot00000000000000import pytest import wn from wn import morphy def test_morphy_uninitialized(): # An unintialized Morphy isn't very bright, but it starts up # fast. It relies on the database to filter bad items. 
m = morphy.Morphy() assert m("example", "n") == {"n": {"example"}} assert m("examples", "n") == {"n": {"examples", "example"}} assert m("examples", "v") == {"v": {"examples", "example", "exampl"}} assert m("exemplifying", "n") == {"n": {"exemplifying"}} assert m("exemplifying", "v") == {"v": {"exemplifying", "exemplify", "exemplifye"}} assert m("data", "n") == {"n": {"data"}} assert m("datums", "n") == {"n": {"datums", "datum"}} # expected false positive assert m("examples", None) == { None: {"examples"}, "n": {"example"}, "v": {"example", "exampl"}, } assert m("exemplifying", None) == { None: {"exemplifying"}, "v": {"exemplify", "exemplifye"}, } assert m("data", None) == {None: {"data"}} @pytest.mark.usefixtures("mini_db") def test_morphy_initialized(): w = wn.Wordnet("test-en:1") m = morphy.Morphy(wordnet=w) assert m("example", "n") == {"n": {"example"}} assert m("examples", "n") == {"n": {"example"}} assert m("examples", "v") == {} assert m("exemplifying", "n") == {} assert m("exemplifying", "v") == {"v": {"exemplify"}} assert m("data", "n") == {"n": {"datum"}} assert m("datums", "n") == {"n": {"datum"}} # expected false positive assert m("examples", None) == {"n": {"example"}} assert m("exemplifying", None) == {"v": {"exemplify"}} assert m("data", None) == {"n": {"datum"}} @pytest.mark.usefixtures("mini_db") def test_issue_154(): # https://github.com/goodmami/wn/issues/154 w = wn.Wordnet("test-en:1") assert w.words("exemplifies") == [w.word("test-en-exemplify-v")] assert w.words("samples") == [] w = wn.Wordnet("test-en:1", lemmatizer=morphy.Morphy()) assert w.words("exemplifies") == [w.word("test-en-exemplify-v")] assert w.words("samples") == [w.word("test-en-sample-n")] wn-1.0.0/tests/primary_query_test.py000066400000000000000000000325041513755206300176320ustar00rootroot00000000000000import pytest import wn @pytest.mark.usefixtures("uninitialized_datadir") def test_lexicons_uninitialized(): assert len(wn.lexicons()) == 0 @pytest.mark.usefixtures("empty_db") 
def test_lexicons_empty(): assert len(wn.lexicons()) == 0 @pytest.mark.usefixtures("mini_db") def test_lexicons_mini(): assert len(wn.lexicons()) == 2 assert all(isinstance(lex, wn.Lexicon) for lex in wn.lexicons()) results = wn.lexicons(lang="en") assert len(results) == 1 assert results[0].language == "en" results = wn.lexicons(lang="es") assert len(results) == 1 assert results[0].language == "es" assert len(wn.lexicons(lexicon="*")) == 2 assert len(wn.lexicons(lexicon="*:1")) == 2 assert len(wn.lexicons(lexicon="test-*")) == 2 assert len(wn.lexicons(lexicon="*-en")) == 1 results = wn.lexicons(lexicon="test-en") assert len(results) == 1 assert results[0].language == "en" results = wn.lexicons(lexicon="test-en:1") assert len(results) == 1 assert results[0].language == "en" results = wn.lexicons(lexicon="test-en:*") assert len(results) == 1 assert results[0].language == "en" assert wn.lexicons(lexicon="test-en")[0].specifier() == "test-en:1" assert wn.lexicons(lexicon="test-es")[0].specifier() == "test-es:1" assert wn.lexicons(lexicon="test-en")[0].requires() == {} assert wn.lexicons(lexicon="test-es")[0].requires() == {} lex = wn.lexicons(lexicon="test-en")[0] # hashability assert {lex: "foo"}[lex] == "foo" @pytest.mark.usefixtures("mini_db") def test_lexicons_unknown(): results = wn.lexicons(lang="unk") assert len(results) == 0 results = wn.lexicons(lexicon="test-unk") assert len(results) == 0 @pytest.mark.usefixtures("empty_db") def test_words_empty(): assert len(wn.words()) == 0 @pytest.mark.usefixtures("mini_db") def test_words_mini(): assert len(wn.words()) == 15 assert all(isinstance(w, wn.Word) for w in wn.words()) words = wn.words("information") # search lemma assert len(words) == 1 assert words[0].lemma() == "information" lemma = words[0].lemma(data=True) assert lemma.value == "information" assert lemma.script == "Latn" assert lemma.tags() == [wn.Tag("tag-text", "tag-category")] words = wn.words("exemplifies") # search secondary form assert len(words) == 1 
assert words[0].lemma() == "exemplify" assert len(wn.words(pos="n")) == 10 assert all(w.pos == "n" for w in wn.words(pos="n")) assert len(wn.words(pos="v")) == 5 assert len(wn.words(pos="q")) == 0 # fake pos assert len(wn.words(lang="en")) == 9 assert len(wn.words(lang="es")) == 6 assert len(wn.words(lexicon="test-en")) == 9 assert len(wn.words(lexicon="test-es")) == 6 assert len(wn.words(lang="en", lexicon="test-en")) == 9 assert len(wn.words(pos="v", lang="en")) == 3 assert len(wn.words("information", lang="en")) == 1 assert len(wn.words("information", lang="es")) == 0 with pytest.raises(wn.Error): wn.words(lang="unk") with pytest.raises(wn.Error): wn.words(lexicon="test-unk") @pytest.mark.usefixtures("empty_db") def test_lemmas_empty(): assert len(wn.lemmas()) == 0 @pytest.mark.usefixtures("mini_db_1_4") def test_lemmas_mini_1_4(): wordnet = wn.Wordnet(lexicon="test-1.4") all_lemmas = wordnet.lemmas() assert len(all_lemmas) == 5 assert all(isinstance(lemma, str) for lemma in all_lemmas) assert all_lemmas == ["Foo Bar", "foo bar", "baz", "BAZ", "Baz"] # data=True should return Form objects and should not dedup lemmas_with_data = wordnet.lemmas(data=True) assert len(lemmas_with_data) == 6 # includes duplicate 'baz' assert all(isinstance(lemma, wn.Form) for lemma in lemmas_with_data) assert [f.value for f in lemmas_with_data] == [ "Foo Bar", "foo bar", "baz", "BAZ", "Baz", "baz", ] # Test deduplication baz_lemmas = wordnet.lemmas("baz", data=False) assert baz_lemmas == ["baz", "BAZ", "Baz"] # With data=True, no dedup baz_forms = wordnet.lemmas("baz", data=True) assert [f.value for f in baz_forms] == ["baz", "BAZ", "Baz", "baz"] # Filter by POS assert len(wordnet.lemmas(pos="n")) == 5 # Foo Bar, foo bar, baz, BAZ, Baz assert len(wordnet.lemmas(pos="v")) == 1 # baz assert len(wordnet.lemmas(pos="q")) == 0 # fake pos # Verify lemmas() returns same results as words() + .lemma() words = wordnet.words() lemmas_from_words = [w.lemma() for w in words] lemmas_direct = 
wordnet.lemmas() assert set(lemmas_from_words) == set(lemmas_direct) # Test wn module function to wordnet instance method assert wn.lemmas(lexicon="test-1.4") == wordnet.lemmas() assert wn.lemmas(data=True, lexicon="test-1.4") == wordnet.lemmas(data=True) with pytest.raises(wn.Error): wn.lemmas(lang="unk") with pytest.raises(wn.Error): wn.lemmas(lexicon="test-unk") @pytest.mark.usefixtures("empty_db") def test_word_empty(): with pytest.raises(wn.Error): assert wn.word("test-es-información-n") @pytest.mark.usefixtures("mini_db") def test_word_mini(): assert wn.word("test-es-información-n") assert wn.word("test-es-información-n", lang="es") assert wn.word("test-es-información-n", lexicon="test-es") with pytest.raises(wn.Error): assert wn.word("test-es-información-n", lang="en") with pytest.raises(wn.Error): assert wn.word("test-es-información-n", lexicon="test-en") with pytest.raises(wn.Error): assert wn.word("test-es-información-n", lang="unk") with pytest.raises(wn.Error): assert wn.word("test-es-información-n", lexicon="test-unk") @pytest.mark.usefixtures("empty_db") def test_senses_empty(): assert len(wn.senses()) == 0 @pytest.mark.usefixtures("mini_db") def test_senses_mini(): assert len(wn.senses()) == 16 assert all(isinstance(s, wn.Sense) for s in wn.senses()) senses = wn.senses("information") # search lemma assert len(senses) == 1 assert senses[0].word().lemma() == "information" assert senses[0].counts() == [3] senses = wn.senses("exemplifies") # search secondary form assert len(senses) == 1 assert senses[0].word().lemma() == "exemplify" assert senses[0].word().lemma() in {"exemplify"} assert "exemplify" in {senses[0].word().lemma()} assert len(wn.senses(pos="n")) == 11 assert len(wn.senses(pos="v")) == 5 assert len(wn.senses(pos="q")) == 0 # fake pos assert len(wn.senses(lang="en")) == 10 assert len(wn.senses(lang="es")) == 6 assert len(wn.senses(lexicon="test-en")) == 10 assert len(wn.senses(lexicon="test-es")) == 6 assert len(wn.senses(lang="en", 
lexicon="test-en")) == 10 assert len(wn.senses(pos="v", lang="en")) == 3 assert len(wn.senses("information", lang="en")) == 1 assert len(wn.senses("information", lang="es")) == 0 with pytest.raises(wn.Error): wn.senses(lang="unk") with pytest.raises(wn.Error): wn.senses(lexicon="test-unk") @pytest.mark.usefixtures("empty_db") def test_sense_empty(): with pytest.raises(wn.Error): assert wn.sense("test-es-información-n-0001-01") @pytest.mark.usefixtures("mini_db") def test_sense_mini(): assert wn.sense("test-es-información-n-0001-01") assert wn.sense("test-es-información-n-0001-01", lang="es") assert wn.sense("test-es-información-n-0001-01", lexicon="test-es") with pytest.raises(wn.Error): assert wn.sense("test-es-información-n-0001-01", lang="en") with pytest.raises(wn.Error): assert wn.sense("test-es-información-n-0001-01", lexicon="test-en") with pytest.raises(wn.Error): assert wn.sense("test-es-información-n-0001-01", lang="unk") with pytest.raises(wn.Error): assert wn.sense("test-es-información-n-0001-01", lexicon="test-unk") @pytest.mark.usefixtures("empty_db") def test_synsets_empty(): assert len(wn.synsets()) == 0 @pytest.mark.usefixtures("mini_db") def test_synsets_mini(): assert len(wn.synsets()) == 12 assert all(isinstance(ss, wn.Synset) for ss in wn.synsets()) synsets = wn.synsets("information") # search lemma assert len(synsets) == 1 assert "information" in synsets[0].lemmas() synsets = wn.synsets("exemplifies") # search secondary form assert len(synsets) == 1 assert "exemplify" in synsets[0].lemmas() assert len(wn.synsets(pos="n")) == 9 assert len(wn.synsets(pos="v")) == 3 assert len(wn.synsets(pos="q")) == 0 # fake pos assert len(wn.synsets(ili="i67469")) == 2 assert len(wn.synsets(ili="i67468")) == 0 assert len(wn.synsets(lang="en")) == 8 assert len(wn.synsets(lang="es")) == 4 assert len(wn.synsets(lexicon="test-en")) == 8 assert len(wn.synsets(lexicon="test-es")) == 4 assert len(wn.synsets(lang="en", lexicon="test-en")) == 8 assert 
len(wn.synsets(pos="v", lang="en")) == 2 assert len(wn.synsets("information", lang="en")) == 1 assert len(wn.synsets("information", lang="es")) == 0 assert len(wn.synsets(ili="i67469", lang="es")) == 1 with pytest.raises(wn.Error): wn.synsets(lang="unk") with pytest.raises(wn.Error): wn.synsets(lexicon="test-unk") @pytest.mark.usefixtures("empty_db") def test_synset_empty(): with pytest.raises(wn.Error): assert wn.synset("test-es-0001-n") @pytest.mark.usefixtures("mini_db") def test_synset_mini(): assert wn.synset("test-es-0001-n") assert wn.synset("test-es-0001-n", lang="es") assert wn.synset("test-es-0001-n", lexicon="test-es") with pytest.raises(wn.Error): assert wn.synset("test-es-0001-n", lang="en") with pytest.raises(wn.Error): assert wn.synset("test-es-0001-n", lexicon="test-en") with pytest.raises(wn.Error): assert wn.synset("test-es-0001-n", lang="unk") with pytest.raises(wn.Error): assert wn.synset("test-es-0001-n", lexicon="test-unk") @pytest.mark.usefixtures("mini_db_1_1") def test_mini_1_1(): assert len(wn.lexicons()) == 4 assert len(wn.lexicons(lang="en")) == 2 assert len(wn.lexicons(lang="ja")) == 1 assert wn.lexicons(lang="ja")[0].logo == "logo.svg" w = wn.Wordnet(lang="en") assert len(w.lexicons()) == 2 assert len(w.expanded_lexicons()) == 0 assert len(w.word("test-en-exemplify-v").lemma(data=True).tags()) == 1 w = wn.Wordnet(lang="ja") assert len(w.lexicons()) == 1 assert len(w.expanded_lexicons()) == 1 assert len(w.synsets("例え")[0].hypernyms()) == 1 assert w.synsets("例え")[0].lexfile() == "noun.cognition" assert len(w.word("test-ja-例え-n").lemma(data=True).pronunciations()) == 1 assert w.word("test-ja-例え-n").forms(data=True)[1].id == "test-ja-例え-n-たとえ" p = w.word("test-ja-例え-n").lemma(data=True).pronunciations()[0] assert p.value == "tatoe" assert p.variety == "standard" assert p.notation == "ipa" assert p.phonemic assert p.audio == "tatoe.wav" w = wn.Wordnet(lang="ja", expand="") assert len(w.lexicons()) == 1 assert len(w.expanded_lexicons()) == 0 
assert len(w.synsets("例え")[0].hypernyms()) == 0 w = wn.Wordnet(lexicon="test-en test-en-ext") assert len(w.lexicons()) == 2 assert len(w.expanded_lexicons()) == 0 assert len(w.synsets("fire")[0].hyponyms()) == 1 @pytest.mark.usefixtures("mini_db_1_1") def test_mini_1_1_lexicons(): lex = wn.lexicons(lexicon="test-en")[0] assert lex.specifier() == "test-en:1" assert not lex.requires() assert lex.extends() is None assert len(lex.extensions()) == 1 assert lex.extensions()[0].specifier() == "test-en-ext:1" lex = wn.lexicons(lexicon="test-es")[0] assert lex.specifier() == "test-es:1" assert not lex.requires() assert lex.extends() is None assert len(lex.extensions()) == 0 lex = wn.lexicons(lexicon="test-en-ext")[0] assert lex.specifier() == "test-en-ext:1" assert not lex.requires() assert lex.extends() is not None assert lex.extends().specifier() == "test-en:1" assert len(lex.extensions()) == 0 lex = wn.lexicons(lexicon="test-ja")[0] assert lex.specifier() == "test-ja:1" assert "test-en:1" in lex.requires() assert lex.extends() is None assert len(lex.extensions()) == 0 @pytest.mark.usefixtures("mini_db_1_4") def test_mini_1_4(): w = wn.Wordnet("test-1.4:1", normalizer=None) # even without a normalizer, entries sharing an index are matched assert len(w.words("Foo Bar")) == 2 assert len(w.words("foo bar")) == 2 # if the index is missing, the lemma is used; normalization doesn't happen assert len(w.words("baz")) == 3 assert len(w.words("Baz")) == 1 # sense order follows values of 'n' assert [s.id for s in w.senses("foo bar")] == [ "test-1.4-foo_bar-n-2", "test-1.4-foo_bar-n-1", "test-1.4-Foo_Bar-n-1", ] assert [s.id for s in w.senses("baz")] == [ "test-1.4-baz-n-1", "test-1.4-BAZ-n-1", "test-1.4-baz-v-1", ] assert [s.id for s in w.senses("baz", pos="v")] == [ "test-1.4-baz-v-1", ] # order is undecided when implicit or explicit valus of n are overlapping assert {s.id for s in w.senses("Baz")} == { "test-1.4-Baz-n-1", "test-1.4-Baz-n-2", } # synset order also follows index 
assert [ss.id for ss in w.synsets("foo bar")] == [ "test-1.4-2", "test-1.4-1", ] wn-1.0.0/tests/project_test.py000066400000000000000000000037231513755206300163710ustar00rootroot00000000000000from wn import project def test_is_package_directory(datadir): assert project.is_package_directory(datadir / "test-package") assert not project.is_package_directory(datadir) def test_is_collection_directory(datadir): # not really, but it is a directory containing a package assert project.is_collection_directory(datadir) assert not project.is_collection_directory(datadir / "test-package") def test_get_project(datadir): proj = project.get_project(path=datadir / "test-package") assert proj.type == "wordnet" assert proj.resource_file() == datadir / "test-package" / "test-wn.xml" assert proj.readme() == datadir / "test-package" / "README.md" assert proj.license() == datadir / "test-package" / "LICENSE" assert proj.citation() == datadir / "test-package" / "citation.bib" proj = project.get_project(path=datadir / "mini-lmf-1.0.xml") assert proj.type == "wordnet" assert proj.resource_file() == datadir / "mini-lmf-1.0.xml" assert proj.readme() is None assert proj.license() is None assert proj.citation() is None def test_iterpackages(datadir): # for now, collection.packages() does not return contained resource files pkg_names = {pkg.resource_file().name for pkg in project.iterpackages(datadir)} assert "mini-lmf-1.0.xml" not in pkg_names assert "test-wn.xml" in pkg_names # explicitly giving a resource file path works, though pkg_names = { pkg.resource_file().name for pkg in project.iterpackages(datadir / "mini-lmf-1.0.xml") } assert "mini-lmf-1.0.xml" in pkg_names assert "test-wn.xml" not in pkg_names def test_compressed_iterpackages(mini_lmf_compressed): for pkg in project.iterpackages(mini_lmf_compressed): assert pkg.type == "wordnet" assert pkg.resource_file().exists() # ensure cleanup of temporary data assert not pkg.resource_file().exists() # ensure original file not deleted assert 
mini_lmf_compressed.exists() wn-1.0.0/tests/relations_test.py000066400000000000000000000134351513755206300167240ustar00rootroot00000000000000import pytest import wn @pytest.mark.usefixtures("mini_db") def test_word_derived_words(): assert len(wn.word("test-en-example-n").derived_words()) == 1 assert len(wn.word("test-es-ejemplo-n").derived_words()) == 1 @pytest.mark.usefixtures("mini_db") def test_synset_hypernyms(): assert wn.synset("test-en-0002-n").hypernyms() == [wn.synset("test-en-0001-n")] assert wn.synset("test-en-0001-n").hypernyms() == [] @pytest.mark.usefixtures("mini_db") def test_synset_hypernyms_expand_default(): assert wn.synset("test-es-0002-n").hypernyms() == [wn.synset("test-es-0001-n")] assert wn.synset("test-es-0001-n").hypernyms() == [] @pytest.mark.usefixtures("mini_db") def test_synset_hypernyms_expand_empty(): w = wn.Wordnet(lang="es", expand="") assert w.synset("test-es-0002-n").hypernyms() == [] @pytest.mark.usefixtures("mini_db") def test_synset_hypernyms_expand_specified(): w = wn.Wordnet(lang="es", expand="test-en") assert w.synset("test-es-0002-n").hypernyms() == [w.synset("test-es-0001-n")] @pytest.mark.usefixtures("mini_db") def test_synset_relations(): w = wn.Wordnet(lang="en") assert w.synset("test-en-0002-n").relations() == { "hypernym": [w.synset("test-en-0001-n")], "hyponym": [w.synset("test-en-0004-n")], } @pytest.mark.usefixtures("mini_db") def test_sense_get_related(): w = wn.Wordnet("test-en") assert w.sense("test-en-example-n-0002-01").get_related() == [ w.sense("test-en-exemplify-v-0003-01") ] @pytest.mark.usefixtures("mini_db") def test_sense_relations(): w = wn.Wordnet("test-en") assert w.sense("test-en-example-n-0002-01").relations() == { "derivation": [w.sense("test-en-exemplify-v-0003-01")] } @pytest.mark.usefixtures("mini_db_1_1") def test_extension_relations(): # default mode assert wn.synset("test-en-0007-v").hypernyms() == [wn.synset("test-en-ext-0009-v")] assert wn.synset("test-en-ext-0009-v").hyponyms() == 
[wn.synset("test-en-0007-v")] assert wn.sense("test-en-information-n-0001-01").get_related("pertainym") == [ wn.sense("test-en-ext-info-n-0001-01") ] assert wn.sense("test-en-ext-info-n-0001-01").get_related("pertainym") == [ wn.sense("test-en-information-n-0001-01") ] # restricted to base w = wn.Wordnet(lexicon="test-en") assert w.synset("test-en-0007-v").hypernyms() == [] assert w.sense("test-en-information-n-0001-01").get_related("pertainym") == [] # base and extension w = wn.Wordnet(lexicon="test-en test-en-ext") assert w.synset("test-en-0007-v").hypernyms() == [w.synset("test-en-ext-0009-v")] assert w.synset("test-en-ext-0009-v").hyponyms() == [w.synset("test-en-0007-v")] assert w.sense("test-en-information-n-0001-01").get_related("pertainym") == [ w.sense("test-en-ext-info-n-0001-01") ] assert w.sense("test-en-ext-info-n-0001-01").get_related("pertainym") == [ w.sense("test-en-information-n-0001-01") ] # restricted to extension w = wn.Wordnet(lexicon="test-en-ext") assert w.synset("test-en-ext-0009-v").hyponyms() == [] assert w.sense("test-en-ext-info-n-0001-01").get_related("pertainym") == [] @pytest.mark.usefixtures("mini_db_1_1") def test_sense_synset_issue_168(): # https://github.com/goodmami/wn/issues/168 ja = wn.Wordnet(lexicon="test-ja", expand="") assert ja.synset("test-ja-0001-n").get_related() == [] assert ja.sense("test-ja-情報-n-0001-01").synset().get_related() == [] @pytest.mark.usefixtures("mini_db") def test_synset_relations_issue_169(): # https://github.com/goodmami/wn/issues/169 en = wn.Wordnet("test-en") assert list(en.synset("test-en-0001-n").relations("hyponym")) == ["hyponym"] es = wn.Wordnet("test-es", expand="test-en") assert list(es.synset("test-es-0001-n").relations("hyponym")) == ["hyponym"] @pytest.mark.usefixtures("mini_db") def test_synset_relations_issue_177(): # https://github.com/goodmami/wn/issues/177 assert "hyponym" in wn.synset("test-es-0001-n").relations() @pytest.mark.usefixtures("mini_db") def 
test_sense_relation_data_true(): en = wn.Wordnet("test-en") assert en.sense("test-en-information-n-0001-01").relations(data=True) == {} relmap = en.sense("test-en-illustrate-v-0003-01").relations(data=True) # only sense-sense relations by default assert len(relmap) == 3 assert all(isinstance(tgt, wn.Sense) for tgt in relmap.values()) assert {rel.name for rel in relmap} == {"derivation", "other"} assert {rel.target_id for rel in relmap} == {"test-en-illustration-n-0002-01"} # sense relations targets should always have same ids as resolved targets assert all(rel.target_id == tgt.id for rel, tgt in relmap.items()) @pytest.mark.usefixtures("mini_db") def test_synset_relations_data_true(): en = wn.Wordnet("test-en") assert en.synset("test-en-0003-v").relations(data=True) == {} relmap = en.synset("test-en-0002-n").relations(data=True) assert len(relmap) == 2 assert {rel.name for rel in relmap} == {"hypernym", "hyponym"} assert {rel.target_id for rel in relmap} == {"test-en-0001-n", "test-en-0004-n"} # synset relation targets have same ids as resolved targets in same lexicon assert all(rel.target_id == tgt.id for rel, tgt in relmap.items()) assert all(rel.lexicon().id == "test-en" for rel in relmap) # interlingual synset relation targets show original target ids es = wn.Wordnet("test-es", expand="test-en") relmap = es.synset("test-es-0002-n").relations(data=True) assert len(relmap) == 2 assert {rel.name for rel in relmap} == {"hypernym", "hyponym"} assert {rel.target_id for rel in relmap} == {"test-en-0001-n", "test-en-0004-n"} assert all(rel.target_id != tgt.id for rel, tgt in relmap.items()) assert all(rel.lexicon().id == "test-en" for rel in relmap) wn-1.0.0/tests/secondary_query_test.py000066400000000000000000000204341513755206300201350ustar00rootroot00000000000000import pytest import wn @pytest.mark.usefixtures("mini_db") def test_word_senses(): assert len(wn.word("test-en-information-n").senses()) == 1 assert len(wn.word("test-es-información-n").senses()) == 1 
@pytest.mark.usefixtures("mini_db") def test_word_synsets(): assert len(wn.word("test-en-information-n").synsets()) == 1 assert len(wn.word("test-es-información-n").synsets()) == 1 @pytest.mark.usefixtures("mini_db") def test_word_translate(): assert len(wn.word("test-en-example-n").translate(lang="es")) == 1 assert len(wn.word("test-es-ejemplo-n").translate(lang="en")) == 1 @pytest.mark.usefixtures("mini_db_1_1") def test_word_lemma_tags(): en = wn.Wordnet("test-en") assert en.word("test-en-exemplify-v").lemma(data=True).tags() == [] ext = wn.Wordnet("test-en test-en-ext") assert ext.word("test-en-exemplify-v").lemma(data=True).tags() == [ wn.Tag(tag="INF", category="tense") ] @pytest.mark.usefixtures("mini_db_1_1") def test_word_lemma_pronunciations(): en = wn.Wordnet("test-en") assert en.word("test-en-information-n").lemma(data=True).pronunciations() == [] ext = wn.Wordnet("test-en test-en-ext") assert ext.word("test-en-information-n").lemma(data=True).pronunciations() == [ wn.Pronunciation(value="ˌɪnfəˈmeɪʃən", variety="GB"), # noqa: RUF001 wn.Pronunciation(value="ˌɪnfɚˈmeɪʃən", variety="US"), # noqa: RUF001 ] @pytest.mark.usefixtures("mini_db") def test_sense_word(): assert wn.sense("test-en-information-n-0001-01").word() == wn.word( "test-en-information-n" ) assert wn.sense("test-es-información-n-0001-01").word() == wn.word( "test-es-información-n" ) @pytest.mark.usefixtures("mini_db") def test_sense_synset(): assert wn.sense("test-en-information-n-0001-01").synset() == wn.synset( "test-en-0001-n" ) assert wn.sense("test-es-información-n-0001-01").synset() == wn.synset( "test-es-0001-n" ) @pytest.mark.usefixtures("mini_db") def test_sense_issue_157(): # https://github.com/goodmami/wn/issues/157 sense = wn.sense("test-en-information-n-0001-01") # This test uses non-public members, which is not ideal, but there # is currently no better alternative. 
assert sense._lexconf is sense.word()._lexconf assert sense._lexconf is sense.synset()._lexconf @pytest.mark.usefixtures("mini_db") def test_sense_examples(): assert wn.sense("test-en-information-n-0001-01").examples() == [] assert wn.sense("test-es-información-n-0001-01").examples() == [] @pytest.mark.usefixtures("mini_db") def test_sense_counts(): assert wn.sense("test-en-information-n-0001-01").counts() == [3] counts = wn.sense("test-en-information-n-0001-01").counts(data=True) assert counts[0].value == 3 assert counts[0].lexicon().specifier() == "test-en:1" assert wn.sense("test-es-información-n-0001-01").counts() == [] @pytest.mark.usefixtures("mini_db") def test_sense_lexicalized(): assert wn.sense("test-en-information-n-0001-01").lexicalized() assert wn.sense("test-es-información-n-0001-01").lexicalized() @pytest.mark.usefixtures("mini_db") def test_sense_frames(): assert wn.sense("test-en-illustrate-v-0003-01").frames() == [ "Somebody ----s something", "Something ----s something", ] assert wn.sense("test-es-ilustrar-v-0003-01").frames() == [] @pytest.mark.usefixtures("mini_db_1_1") def test_sense_frames_issue_156(): # https://github.com/goodmami/wn/issues/156 assert wn.sense("test-ja-示す-v-0003-01").frames() == [ "ある人が何かを----", ] assert wn.sense("test-ja-事例-n-0002-01").frames() == [] @pytest.mark.usefixtures("mini_db") def test_sense_translate(): assert len(wn.sense("test-en-information-n-0001-01").translate(lang="es")) == 1 assert len(wn.sense("test-es-información-n-0001-01").translate(lang="en")) == 1 @pytest.mark.usefixtures("mini_db") def test_synset_senses(): assert len(wn.synset("test-en-0003-v").senses()) == 2 assert len(wn.synset("test-es-0003-v").senses()) == 2 @pytest.mark.usefixtures("mini_db") def test_synset_words(): assert len(wn.synset("test-en-0003-v").words()) == 2 assert len(wn.synset("test-es-0003-v").words()) == 2 @pytest.mark.usefixtures("mini_db") def test_synset_lemmas(): assert wn.synset("test-en-0003-v").lemmas() == ["exemplify", 
"illustrate"] assert wn.synset("test-es-0003-v").lemmas() == ["ejemplificar", "ilustrar"] @pytest.mark.usefixtures("mini_db") def test_synset_ili(): # Synset ILIs are now just strings; see ili_test.py for wn.ili tests assert isinstance(wn.synset("test-en-0001-n").ili, str) @pytest.mark.usefixtures("mini_db") def test_synset_definition(): assert wn.synset("test-en-0001-n").definition() == "something that informs" defn = wn.synset("test-en-0001-n").definition(data=True) assert defn.source_sense_id == "test-en-information-n-0001-01" assert defn.lexicon().specifier() == "test-en:1" assert wn.synset("test-es-0001-n").definition() == "algo que informa" @pytest.mark.usefixtures("mini_db") def test_synset_definitions(): assert wn.synset("test-en-0001-n").definitions() == ["something that informs"] defns = wn.synset("test-en-0001-n").definitions(data=True) assert defns[0].source_sense_id == "test-en-information-n-0001-01" assert wn.synset("test-es-0001-n").definitions() == ["algo que informa"] @pytest.mark.usefixtures("mini_db") def test_synset_examples(): assert wn.synset("test-en-0001-n").examples() == ['"this is information"'] ex = wn.synset("test-en-0001-n").examples(data=True)[0] assert ex.text == '"this is information"' assert ex.lexicon().specifier() == "test-en:1" assert wn.synset("test-es-0001-n").examples() == ['"este es la información"'] @pytest.mark.usefixtures("mini_db") def test_synset_lexicalized(): assert wn.synset("test-en-0001-n").lexicalized() assert wn.synset("test-es-0001-n").lexicalized() @pytest.mark.usefixtures("mini_db") def test_synset_translate(): assert len(wn.synset("test-en-0001-n").translate(lang="es")) == 1 assert len(wn.synset("test-es-0001-n").translate(lang="en")) == 1 @pytest.mark.usefixtures("uninitialized_datadir") def test_word_sense_order(datadir): wn.add(datadir / "sense-member-order.xml") assert [s.id for s in wn.word("test-foo-n").senses()] == [ "test-01-foo-n", "test-02-foo-n", ] assert [s.id for s in 
wn.word("test-bar-n").senses()] == [ "test-02-bar-n", "test-01-bar-n", ] @pytest.mark.usefixtures("uninitialized_datadir") def test_synset_member_order(datadir): wn.add(datadir / "sense-member-order.xml") assert [s.id for s in wn.synset("test-01-n").senses()] == [ "test-01-bar-n", "test-01-foo-n", ] assert [s.id for s in wn.synset("test-02-n").senses()] == [ "test-02-bar-n", "test-02-foo-n", ] @pytest.mark.usefixtures("mini_db") def test_confidence(): # default for unmarked lexicon is 1.0 assert wn.lexicons(lexicon="test-es")[0].confidence() == 1.0 # explicitly set lexicon confidence becomes the default for sub-elements assert wn.lexicons(lexicon="test-en")[0].confidence() == 0.9 assert wn.word("test-en-information-n").confidence() == 0.9 assert wn.sense("test-en-information-n-0001-01").confidence() == 0.9 assert ( wn.sense("test-en-information-n-0001-01").counts(data=True)[0].confidence() ) == 0.9 assert ( wn.sense("test-en-exemplify-v-0003-01") .relations(data=True) .popitem()[0] .confidence() ) == 0.9 # explicit value overrides default assert wn.word("test-en-example-n").confidence() == 1.0 assert ( wn.sense("test-en-example-n-0002-01") .relations(data=True) .popitem()[0] .confidence() ) == 0.5 # values on parents don't override default on children assert wn.sense("test-en-example-n-0002-01").confidence() == 0.9 # check values on other elements assert wn.synset("test-en-0001-n").confidence() == 1.0 assert wn.synset("test-en-0001-n").definition(data=True).confidence() == 0.95 assert ( wn.synset("test-en-0001-n").relations(data=True).popitem()[0].confidence() ) == 0.8 assert wn.synset("test-en-0001-n").examples(data=True)[0].confidence() == 0.7 wn-1.0.0/tests/similarity_test.py000066400000000000000000000154321513755206300171110ustar00rootroot00000000000000from math import log import pytest import wn from wn import similarity as sim from wn.ic import information_content as infocont from wn.taxonomy import taxonomy_depth def get_synsets(w): return { "information": 
w.synset("test-en-0001-n"), "example": w.synset("test-en-0002-n"), "sample": w.synset("test-en-0004-n"), "random sample": w.synset("test-en-0005-n"), "random sample2": w.synset("test-en-0008-n"), "datum": w.synset("test-en-0006-n"), "exemplify": w.synset("test-en-0003-v"), } # some fake information content; computed using: # words = ['example', 'example', 'sample', 'random sample', 'illustrate'] # ic = compute(words, wn.Wordnet('test-en'), distribute_weight=False) ic = { "n": { "test-en-0001-n": 5.0, # information "test-en-0002-n": 5.0, # example, illustration "test-en-0004-n": 3.0, # sample "test-en-0005-n": 2.0, # random sample "test-en-0008-n": 2.0, # random sample 2 "test-en-0006-n": 1.0, # datum None: 6.0, }, "v": { "test-en-0003-v": 2.0, # exemplify, illustrate "test-en-0007-v": 1.0, # resignate None: 2.0, }, "a": {None: 1.0}, "r": {None: 1.0}, } @pytest.mark.usefixtures("mini_db") def test_path(): ss = get_synsets(wn.Wordnet("test-en")) assert sim.path(ss["information"], ss["information"]) == 1 / 1 assert sim.path(ss["information"], ss["example"]) == 1 / 2 assert sim.path(ss["information"], ss["sample"]) == 1 / 3 assert sim.path(ss["information"], ss["random sample"]) == 1 / 4 assert sim.path(ss["random sample"], ss["datum"]) == 1 / 5 assert sim.path(ss["random sample2"], ss["datum"]) == 0 assert sim.path(ss["random sample2"], ss["datum"], simulate_root=True) == 1 / 4 assert ( sim.path(ss["random sample"], ss["random sample2"], simulate_root=True) == 1 / 6 ) with pytest.raises(wn.Error): sim.path(ss["example"], ss["exemplify"]) with pytest.raises(wn.Error): sim.wup(ss["example"], ss["exemplify"], simulate_root=True) @pytest.mark.usefixtures("mini_db") def test_wup(): ss = get_synsets(wn.Wordnet("test-en")) assert sim.wup(ss["information"], ss["information"]) == (2 * 1) / (0 + 0 + 2 * 1) assert sim.wup(ss["information"], ss["example"]) == (2 * 1) / (0 + 1 + 2 * 1) assert sim.wup(ss["information"], ss["sample"]) == (2 * 1) / (0 + 2 + 2 * 1) assert 
sim.wup(ss["information"], ss["random sample"]) == (2 * 1) / (0 + 3 + 2 * 1) assert sim.wup(ss["random sample"], ss["datum"]) == (2 * 1) / (3 + 1 + 2 * 1) with pytest.raises(wn.Error): assert sim.wup(ss["random sample2"], ss["datum"]) assert sim.wup(ss["random sample2"], ss["datum"], simulate_root=True) == (2 * 1) / ( 1 + 2 + 2 * 1 ) assert sim.wup(ss["random sample"], ss["random sample2"], simulate_root=True) == ( 2 * 1 ) / (4 + 1 + 2 * 1) with pytest.raises(wn.Error): sim.wup(ss["example"], ss["exemplify"]) with pytest.raises(wn.Error): sim.wup(ss["example"], ss["exemplify"], simulate_root=True) @pytest.mark.usefixtures("mini_db") def test_lch(): w = wn.Wordnet("test-en") ss = get_synsets(w) d_n = taxonomy_depth(w, "n") assert sim.lch(ss["information"], ss["information"], d_n) == -log( (0 + 1) / (2 * d_n) ) assert sim.lch(ss["information"], ss["example"], d_n) == -log((1 + 1) / (2 * d_n)) assert sim.lch(ss["information"], ss["sample"], d_n) == -log((2 + 1) / (2 * d_n)) assert sim.lch(ss["information"], ss["random sample"], d_n) == -log( (3 + 1) / (2 * d_n) ) assert sim.lch(ss["random sample"], ss["datum"], d_n) == -log((4 + 1) / (2 * d_n)) with pytest.raises(wn.Error): assert sim.lch(ss["random sample2"], ss["datum"], d_n) assert sim.lch(ss["random sample2"], ss["datum"], d_n, simulate_root=True) == -log( (3 + 1) / (2 * d_n) ) assert sim.lch( ss["random sample"], ss["random sample2"], d_n, simulate_root=True ) == -log((5 + 1) / (2 * d_n)) with pytest.raises(wn.Error): sim.lch(ss["example"], ss["exemplify"], d_n) with pytest.raises(wn.Error): sim.lch(ss["example"], ss["exemplify"], d_n, simulate_root=True) @pytest.mark.usefixtures("mini_db") def test_res(): w = wn.Wordnet("test-en") ss = get_synsets(w) assert sim.res(ss["information"], ss["information"], ic) == infocont( ss["information"], ic ) assert sim.res(ss["information"], ss["example"], ic) == infocont( ss["information"], ic ) assert sim.res(ss["information"], ss["sample"], ic) == infocont( 
ss["information"], ic ) assert sim.res(ss["information"], ss["random sample"], ic) == infocont( ss["information"], ic ) assert sim.res(ss["random sample"], ss["datum"], ic) == infocont( ss["information"], ic ) with pytest.raises(wn.Error): sim.res(ss["random sample2"], ss["datum"], ic) with pytest.raises(wn.Error): sim.res(ss["example"], ss["exemplify"], ic) @pytest.mark.usefixtures("mini_db") def test_jcn(): w = wn.Wordnet("test-en") ss = get_synsets(w) info_ic = infocont(ss["information"], ic) assert sim.jcn(ss["information"], ss["information"], ic) == float("inf") assert sim.jcn(ss["information"], ss["example"], ic) == float("inf") assert sim.jcn(ss["information"], ss["sample"], ic) == 1 / ( (info_ic + infocont(ss["sample"], ic)) - 2 * info_ic ) assert sim.jcn(ss["information"], ss["random sample"], ic) == 1 / ( (info_ic + infocont(ss["random sample"], ic)) - 2 * info_ic ) assert sim.jcn(ss["random sample"], ss["datum"], ic) == 1 / ( (infocont(ss["random sample"], ic) + infocont(ss["datum"], ic)) - 2 * info_ic ) with pytest.raises(wn.Error): sim.jcn(ss["random sample2"], ss["datum"], ic) with pytest.raises(wn.Error): sim.jcn(ss["example"], ss["exemplify"], ic) @pytest.mark.usefixtures("mini_db") def test_lin(): w = wn.Wordnet("test-en") ss = get_synsets(w) info_ic = infocont(ss["information"], ic) assert sim.lin(ss["information"], ss["information"], ic) == 1.0 assert sim.lin(ss["information"], ss["example"], ic) == 1.0 assert sim.lin(ss["information"], ss["sample"], ic) == (2 * info_ic) / ( info_ic + infocont(ss["sample"], ic) ) assert sim.lin(ss["information"], ss["random sample"], ic) == (2 * info_ic) / ( info_ic + infocont(ss["random sample"], ic) ) assert sim.lin(ss["random sample"], ss["datum"], ic) == ( (2 * info_ic) / (infocont(ss["random sample"], ic) + infocont(ss["datum"], ic)) ) with pytest.raises(wn.Error): sim.lin(ss["random sample2"], ss["datum"], ic) with pytest.raises(wn.Error): sim.lin(ss["example"], ss["exemplify"], ic) 
wn-1.0.0/tests/taxonomy_test.py000066400000000000000000000077501513755206300166050ustar00rootroot00000000000000import pytest import wn from wn.taxonomy import ( hypernym_paths, leaves, max_depth, min_depth, roots, shortest_path, taxonomy_depth, ) @pytest.mark.usefixtures("mini_db") def test_roots(): en = wn.Wordnet("test-en") assert set(roots(en, pos="n")) == { en.synset("test-en-0001-n"), en.synset("test-en-0008-n"), } assert set(roots(en, pos="v")) == { en.synset("test-en-0003-v"), en.synset("test-en-0007-v"), } assert roots(en, pos="a") == [] assert set(roots(en)) == set(roots(en, pos="n") + roots(en, pos="v")) # with no expand relations and no relation of its own, every # synset looks like a root es = wn.Wordnet("test-es") assert set(roots(es, pos="n")) == { es.synset("test-es-0001-n"), es.synset("test-es-0002-n"), es.synset("test-es-0005-n"), } es = wn.Wordnet("test-es", expand="test-en") assert roots(es, pos="n") == [es.synset("test-es-0001-n")] @pytest.mark.usefixtures("mini_db") def test_leaves(): en = wn.Wordnet("test-en") assert set(leaves(en, pos="n")) == { en.synset("test-en-0005-n"), en.synset("test-en-0006-n"), en.synset("test-en-0008-n"), } assert set(leaves(en, pos="v")) == { en.synset("test-en-0003-v"), en.synset("test-en-0007-v"), } @pytest.mark.usefixtures("mini_db") def test_taxonomy_depth(): en = wn.Wordnet("test-en") assert taxonomy_depth(en, pos="n") == 3 assert taxonomy_depth(en, pos="v") == 0 @pytest.mark.usefixtures("mini_db") def test_hypernym_paths(): information = wn.synsets("information")[0] example = wn.synsets("example")[0] sample = wn.synsets("sample")[0] random_sample = wn.synsets("random sample")[0] assert hypernym_paths(information) == [] assert hypernym_paths(example) == [[information]] assert hypernym_paths(sample) == [[example, information]] assert hypernym_paths(random_sample) == [[sample, example, information]] @pytest.mark.usefixtures("mini_db") def test_interlingual_hypernym_paths(): información = 
wn.synsets("información")[0] ejemplo = wn.synsets("ejemplo")[0] sample = wn.synsets("sample", lexicon="test-en:1")[0] inferred = wn.Synset.empty("*INFERRED*", ili=sample.ili, _lexicon="test-es:1") muestra_aleatoria = wn.synsets("muestra aleatoria")[0] assert hypernym_paths(información) == [] assert hypernym_paths(ejemplo) == [[información]] assert hypernym_paths(muestra_aleatoria) == [[inferred, ejemplo, información]] @pytest.mark.usefixtures("mini_db") def test_shortest_path(): information = wn.synsets("information")[0] example = wn.synsets("example")[0] sample = wn.synsets("sample")[0] random_sample = wn.synsets("random sample")[0] datum = wn.synsets("datum")[0] exemplify = wn.synsets("exemplify")[0] inferred_root = wn.Synset.empty("*ROOT*", _lexicon="test-en:1") assert shortest_path(information, information) == [] assert shortest_path(information, datum) == [datum] assert shortest_path(information, sample) == [example, sample] assert shortest_path(sample, information) == [example, information] assert shortest_path(random_sample, datum) == [sample, example, information, datum] with pytest.raises(wn.Error): shortest_path(example, exemplify) assert shortest_path(example, exemplify, simulate_root=True) == [ information, inferred_root, exemplify, ] @pytest.mark.usefixtures("mini_db") def test_min_depth(): assert min_depth(wn.synsets("information")[0]) == 0 assert min_depth(wn.synsets("example")[0]) == 1 assert min_depth(wn.synsets("sample")[0]) == 2 assert min_depth(wn.synsets("random sample")[0]) == 3 @pytest.mark.usefixtures("mini_db") def test_max_depth(): assert max_depth(wn.synsets("information")[0]) == 0 assert max_depth(wn.synsets("example")[0]) == 1 assert max_depth(wn.synsets("sample")[0]) == 2 assert max_depth(wn.synsets("random sample")[0]) == 3 wn-1.0.0/tests/util_test.py000066400000000000000000000005501513755206300156730ustar00rootroot00000000000000from wn import util def test_synset_id_formatter(): f = util.synset_id_formatter assert f()(prefix="xyz", 
offset=123, pos="n") == "xyz-00000123-n" assert f(prefix="xyz")(offset=123, pos="n") == "xyz-00000123-n" assert f(prefix="xyz", pos="n")(offset=123) == "xyz-00000123-n" assert f("abc-{offset}-{pos}")(offset=1, pos="v") == "abc-1-v" wn-1.0.0/tests/validate_test.py000066400000000000000000000011201513755206300165010ustar00rootroot00000000000000import pytest from wn import lmf from wn.validate import validate tests = [ ("E101", 0), ("E101", 1), ("E101", 2), ("E101", 3), ("W305", 0), ("W306", 0), ("W307", 0), ] test_ids = [f"{code}-{i}" for code, i in tests] @pytest.mark.parametrize(("code", "i"), tests, ids=test_ids) def test_validate(datadir, code: str, i: int) -> None: path = datadir / f"{code}-{i}.xml" lex = lmf.load(path, progress_handler=None)["lexicons"][0] report = validate(lex, select=[code], progress_handler=None) print(report) assert len(report[code]["items"]) > 0 wn-1.0.0/tests/wordnet_test.py000066400000000000000000000064261513755206300164100ustar00rootroot00000000000000from pathlib import Path import pytest import wn @pytest.mark.usefixtures("mini_db_1_1") def test_wordnet_lexicons(): en = wn.Wordnet("test-en") assert len(en.lexicons()) == 1 assert len(en.expanded_lexicons()) == 0 en1 = wn.Wordnet("test-en:1") assert en.lexicons() == en1.lexicons() assert en.expanded_lexicons() == en1.expanded_lexicons() en2 = wn.Wordnet(lang="en") assert len(en2.lexicons()) == 2 assert len(en2.expanded_lexicons()) == 0 es = wn.Wordnet("test-es") assert len(es.lexicons()) == 1 assert len(es.expanded_lexicons()) == 0 es2 = wn.Wordnet("test-es", expand="test-en") assert len(es2.lexicons()) == 1 assert len(es2.expanded_lexicons()) == 1 ja = wn.Wordnet("test-ja") assert len(ja.lexicons()) == 1 assert len(ja.expanded_lexicons()) == 1 ja2 = wn.Wordnet("test-ja", expand="") assert len(ja2.lexicons()) == 1 assert len(ja2.expanded_lexicons()) == 0 @pytest.mark.usefixtures("mini_db") def test_wordnet_normalize(): es = wn.Wordnet("test-es") assert es.words("Informacion") == 
es.words("información") assert es.words("ínfórmácíón") == es.words("información") es = wn.Wordnet("test-es", normalizer=None) assert es.words("informacion") == [] assert es.words("Información") == [] # The following doesn't necessarily work because any non-None # normalizer causes the normalized form column to be tested with # the original form # es = wn.Wordnet('test-es', normalizer=str.lower) # assert es.words('informacion') == [] # assert es.words('Información') == es.words('información') @pytest.mark.usefixtures("mini_db") def test_wordnet_lemmatize(): # default lemmatizer compares alternative forms en = wn.Wordnet("test-en") assert en.words("examples") == [] assert en.words("exemplifying") == en.words("exemplify") assert en.words("data") == en.words("datum") en = wn.Wordnet("test-en", search_all_forms=False) assert en.words("examples") == [] assert en.words("exemplifying") == [] assert en.words("data") == [] def morphy_lite(form, pos): result = {pos: {form}} if pos in ("n", None) and form.endswith("s"): result.setdefault("n", set()).add(form[:-1]) return result en = wn.Wordnet("test-en", lemmatizer=morphy_lite, search_all_forms=False) assert en.words("examples", pos="n") == en.words("example") assert en.words("examples") == en.words("example") assert en.words("exemplifying") == [] assert en.words("data") == [] en = wn.Wordnet("test-en", lemmatizer=morphy_lite, search_all_forms=True) assert en.words("data") == en.words("datum") assert en.words("exemplifying") == en.words("exemplify") def test_portable_entities_issue_226(monkeypatch, tmp_path, datadir): dir = tmp_path / "wn_issue_226" with monkeypatch.context() as m: m.setattr(wn.config, "data_directory", Path(dir)) wn.add(datadir / "mini-lmf-1.0.xml") en = wn.Wordnet("test-en") info1 = en.synsets("information")[0] wn.remove("test-en") wn.add(datadir / "mini-lmf-1.0.xml") info2 = en.synsets("information")[0] # en Wordnet object still works assert info1 == info2 # synsets are equivalent wn._db.clear_connections() 
wn-1.0.0/wn/000077500000000000000000000000001513755206300125675ustar00rootroot00000000000000wn-1.0.0/wn/__init__.py000066400000000000000000000023411513755206300147000ustar00rootroot00000000000000""" Wordnet Interface. """ __all__ = ( "ConfigurationError", "Count", "DatabaseError", "Definition", "Error", "Example", "Form", "Lexicon", "ProjectError", "Pronunciation", "Relation", "Sense", "Synset", "Tag", "WnWarning", "Word", "Wordnet", "__version__", "add", "add_lexical_resource", "download", "export", "lemmas", "lexicons", "projects", "remove", "reset_database", "sense", "senses", "synset", "synsets", "word", "words", ) from wn._add import add, add_lexical_resource, remove from wn._config import config # noqa: F401 from wn._core import ( Count, Definition, Example, Form, Pronunciation, Relation, Sense, Synset, Tag, Word, ) from wn._download import download from wn._exceptions import ( ConfigurationError, DatabaseError, Error, ProjectError, WnWarning, ) from wn._export import export from wn._lexicon import Lexicon from wn._module_functions import ( lemmas, lexicons, projects, reset_database, sense, senses, synset, synsets, word, words, ) from wn._wordnet import Wordnet __version__ = "1.0.0" wn-1.0.0/wn/__main__.py000066400000000000000000000120231513755206300146570ustar00rootroot00000000000000import argparse import json import logging import sys from pathlib import Path import wn from wn import lmf from wn._util import format_lexicon_specifier from wn.project import iterpackages from wn.validate import validate def _download(args): if args.index: wn.config.load_index(args.index) for target in args.target: wn.download(target, add=args.add) def _lexicons(args): for lex in wn.lexicons(lang=args.lang, lexicon=args.lexicon): print("\t".join((lex.id, lex.version, f"[{lex.language}]", lex.label))) def _projects(args): for info in wn.projects(): key = "i" key += "c" if info["cache"] else "-" # key += 'a' if False else '-' # TODO: check if project is added to db print( 
"\t".join( ( key, info["id"], info["version"], f"[{info['language'] or '---'}]", info["label"] or "---", ) ) ) def _validate(args): all_valid = True selectseq = [check.strip() for check in args.select.split(",")] for package in iterpackages(args.FILE): resource = lmf.load(package.resource_file()) for lexicon in resource["lexicons"]: spec = format_lexicon_specifier(lexicon["id"], lexicon["version"]) print(f"{spec:<20}", end="") report = validate(lexicon, select=selectseq) if not any(check.get("items", []) for check in report.values()): print("passed") else: print("failed") all_valid = False # clean up report for code in list(report): if not report[code].get("items"): del report[code] if args.output_file: with open(args.output_file, "w") as outfile: json.dump(report, outfile, indent=2) else: for _code, check in report.items(): if not check["items"]: continue print(f" {check['message']}") for id, context in check["items"].items(): print(f" {id}: {context}" if context else f" {id}") sys.exit(0 if all_valid else 1) def _path_type(arg): return Path(arg) def _file_path_type(arg): path = Path(arg) if not path.is_file(): raise argparse.ArgumentTypeError(f"cannot file file: {arg}") return path parser = argparse.ArgumentParser( prog="python3 -m wn", description="Manage Wn's wordnet data from the command line.", ) parser.add_argument("-V", "--version", action="version", version=f"Wn {wn.__version__}") parser.add_argument( "-v", "--verbose", action="count", dest="verbosity", default=0, help="increase verbosity (can repeat: -vv, -vvv)", ) parser.add_argument( "-d", "--dir", type=_path_type, help="data directory for Wn's database and cache", ) parser.set_defaults(func=lambda _: parser.print_help()) sub_parsers = parser.add_subparsers(title="subcommands") parser_download = sub_parsers.add_parser( "download", description="Download wordnets and add them to Wn's database.", help="download wordnets", ) parser_download.add_argument("target", nargs="+", help="project specifiers or 
URLs") parser_download.add_argument( "--index", type=_file_path_type, help="project index to use for downloading" ) parser_download.add_argument( "--no-add", action="store_false", dest="add", help="download and cache without adding to the database", ) parser_download.set_defaults(func=_download) parser_lexicons = sub_parsers.add_parser( "lexicons", description="Display a list of installed lexicons.", help="list installed lexicons", ) parser_lexicons.add_argument("-l", "--lang", help="BCP 47 language code") parser_lexicons.add_argument("--lexicon", help="lexicon specifiers") parser_lexicons.set_defaults(func=_lexicons) parser_projects = sub_parsers.add_parser( "projects", description=( "Display a list of known projects. The first column shows the " "status for a project (i=indexed, c=cached)." ), help="list known projects", ) parser_projects.set_defaults(func=_projects) parser_validate = sub_parsers.add_parser( "validate", description=("Validate a WN-LMF lexicon"), help="validate a lexicon", ) parser_validate.add_argument( "FILE", type=_file_path_type, help="WN-LMF (XML) lexicon file to validate" ) parser_validate.add_argument( "--select", metavar="CHECKS", default="E,W", help="comma-separated list of checks to run (default: E,W)", ) parser_validate.add_argument( "--output-file", metavar="FILE", help="write report to a JSON file" ) parser_validate.set_defaults(func=_validate) args = parser.parse_args() logging.basicConfig(level=logging.ERROR - (min(args.verbosity, 3) * 10)) if args.dir: wn.config.data_directory = args.dir args.func(args) wn-1.0.0/wn/_add.py000066400000000000000000001113501513755206300140310ustar00rootroot00000000000000""" Adding and removing lexicons to/from the database. 
""" import logging import sqlite3 from collections.abc import Iterable, Iterator, Sequence from itertools import islice from pathlib import Path from typing import TypeVar, cast from wn import constants, lmf from wn import ili as _ili from wn._config import config from wn._db import connect from wn._exceptions import Error from wn._queries import ( get_lexicon_extensions, resolve_lexicon_specifiers, ) from wn._types import AnyPath from wn._util import format_lexicon_specifier, normalize_form from wn.project import iterpackages from wn.util import ProgressBar, ProgressHandler log = logging.getLogger("wn") BATCH_SIZE = 1000 DEFAULT_MEMBER_RANK = 127 # synset member rank when not specified by 'members' ENTRY_QUERY = """ SELECT e.rowid FROM entries AS e WHERE e.id = ? AND e.lexicon_rowid = ? """ # forms don't have reliable ids, so also consider rank; this depends # on each form having a unique rank, and this doesn't work for lexicon # extensions FORM_QUERY = """ SELECT f.rowid FROM forms AS f JOIN entries AS e ON f.entry_rowid = e.rowid WHERE e.id = ? AND e.lexicon_rowid = ? AND (f.id = ? OR f.rank = ?) """ SENSE_QUERY = """ SELECT s.rowid FROM senses AS s WHERE s.id = ? AND s.lexicon_rowid = ? """ SYNSET_QUERY = """ SELECT ss.rowid FROM synsets AS ss WHERE ss.id = ? AND ss.lexicon_rowid = ? """ RELTYPE_QUERY = """ SELECT rt.rowid FROM relation_types AS rt WHERE rt.type = ? """ ILISTAT_QUERY = """ SELECT ist.rowid FROM ili_statuses AS ist WHERE ist.status = ? """ LEXFILE_QUERY = """ SELECT lf.rowid FROM lexfiles AS lf WHERE lf.name = ? """ _AnyLexicon = lmf.Lexicon | lmf.LexiconExtension _AnyEntry = lmf.LexicalEntry | lmf.ExternalLexicalEntry _AnyLemma = lmf.Lemma | lmf.ExternalLemma _AnyForm = lmf.Form | lmf.ExternalForm _AnySense = lmf.Sense | lmf.ExternalSense _AnySynset = lmf.Synset | lmf.ExternalSynset def add( source: AnyPath, progress_handler: type[ProgressHandler] | None = ProgressBar, ) -> None: """Add the LMF or ILI file at *source* to the database. 
The file at *source* may be gzip-compressed or plain text file. >>> wn.add("english-wordnet-2020.xml") Added ewn:2020 (English WordNet) The *progress_handler* parameter takes a subclass of :class:`wn.util.ProgressHandler`. An instance of the class will be created, used, and closed by this function. """ if progress_handler is None: progress_handler = ProgressHandler progress = progress_handler(message="Database") log.info("adding project to database") log.info(" database: %s", config.database_path) log.info(" project file: %s", source) try: for package in iterpackages(source): match package.type: case constants._WORDNET: _add_lmf(package.resource_file(), progress, progress_handler) case constants._ILI: _add_ili(package.resource_file(), progress) case _: raise Error(f"unknown package type: {package.type}") finally: progress.close() def _add_lmf( source: Path, progress: ProgressHandler, progress_handler: type[ProgressHandler], ) -> None: # abort if lexicons in *source* are already added progress.flash(f"Checking {source!s}") infos = lmf.scan_lexicons(source) if not infos: progress.flash(f"{source}: No lexicons found") return skipmap = _precheck(infos, progress) if all(skipmap.values()): return # nothing to do # all clear, try to add them progress.flash(f"Reading {source!s}") resource = lmf.load(source, progress_handler) _add_lexical_resource(resource, skipmap, progress) def add_lexical_resource( resource: lmf.LexicalResource, progress_handler: type[ProgressHandler] | None = ProgressBar, ) -> None: """Add the lexical resource *resource* to the database. The *resource* argument is an in-memory lexical resource as from :func:`wn.lmf.load` and not a file on disk. >>> resource = wn.lmf.load("english-wordnet-2024.xml") >>> wn.add_lexical_resource(resource) Added ewn:2020 (English WordNet) The *progress_handler* parameter takes a subclass of :class:`wn.util.ProgressHandler`. An instance of the class will be created, used, and closed by this function. 
""" if progress_handler is None: progress_handler = ProgressHandler progress = progress_handler(message="Database") try: progress.flash("Checking resource") if not resource["lexicons"]: progress.flash("No lexicons found") return skipmap = _precheck(resource["lexicons"], progress) if all(skipmap.values()): return # nothing to do _add_lexical_resource(resource, skipmap, progress) finally: progress.close() def _add_lexical_resource( resource: lmf.LexicalResource, skipmap: dict[str, bool], progress: ProgressHandler, ) -> None: with connect() as conn: cur = conn.cursor() # these two settings increase the risk of database corruption # if the system crashes during a write, but they should also # make inserts much faster cur.execute("PRAGMA synchronous = OFF") cur.execute("PRAGMA journal_mode = MEMORY") for lexicon in resource["lexicons"]: spec = format_lexicon_specifier(lexicon["id"], lexicon["version"]) if skipmap[spec]: continue # _precheck() says this should be skipped progress.flash("Updating lookup tables") _update_lookup_tables(lexicon, cur) progress.set(count=0, total=_sum_counts(lexicon)) synsets: Sequence[_AnySynset] = _synsets(lexicon) entries: Sequence[_AnyEntry] = _entries(lexicon) synbhrs: Sequence[lmf.SyntacticBehaviour] = _collect_frames(lexicon) lexid, baseid = _insert_lexicon(lexicon, cur, progress) lexidmap = _build_lexid_map(lexicon, lexid, baseid) _insert_synsets(synsets, lexid, cur, progress) _insert_entries(entries, lexid, cur, progress) _insert_index(entries, lexid, cur, progress) _insert_forms(entries, lexid, lexidmap, cur, progress) _insert_pronunciations(entries, lexid, lexidmap, cur, progress) _insert_tags(entries, lexid, lexidmap, cur, progress) _insert_senses(entries, synsets, lexid, lexidmap, cur, progress) _insert_adjpositions(entries, lexid, lexidmap, cur, progress) _insert_counts(entries, lexid, lexidmap, cur, progress) _insert_syntactic_behaviours(synbhrs, lexid, lexidmap, cur, progress) _insert_synset_relations(synsets, lexid, lexidmap, 
cur, progress) _insert_sense_relations(lexicon, lexid, lexidmap, cur, progress) _insert_synset_definitions(synsets, lexid, lexidmap, cur, progress) _insert_examples( [sense for e in entries for sense in _senses(e)], lexid, lexidmap, "sense_examples", cur, progress, ) _insert_examples(synsets, lexid, lexidmap, "synset_examples", cur, progress) progress.set(status="") # clear type string progress.flash(f"Added {spec} ({lexicon['label']})\n") def _precheck( infos: Sequence[lmf.ScanInfo | lmf.Lexicon | lmf.LexiconExtension], progress: ProgressHandler, ) -> dict[str, bool]: skipmap: dict[str, bool] = {} lexqry = "SELECT * FROM lexicons WHERE id = :id AND version = :version" with connect() as conn: cur = conn.cursor() for info in infos: key = format_lexicon_specifier(info["id"], info["version"]) base: lmf.LexiconSpecifier | None = info.get("extends") # type: ignore skipmap[key] = False reason = "" # can't have two lexicons with the same specifier in the db if cur.execute(lexqry, info).fetchone(): skipmap[key] = True reason = "already added" # can't have an extension without the base elif base and cur.execute(lexqry, base).fetchone() is None: skipmap[key] = True base_key = format_lexicon_specifier(base["id"], base["version"]) reason = f"base lexicon ({base_key}) not available" if reason: progress.flash(f"Skipping {key} ({info['label']}); {reason}\n") return skipmap def _sum_counts(lex: _AnyLexicon) -> int: ents = _entries(lex) locs = _local_entries(ents) lems = [e["lemma"] for e in locs if e.get("lemma")] frms = [f for e in ents for f in _forms(e)] sens = [s for e in ents for s in _senses(e)] syns = _synsets(lex) return sum( [ # index (every entry must be processed; not all use index) len(ents), # lexical entries len(ents), len(lems), sum(len(lem.get("pronunciations", [])) for lem in lems), sum(len(lem.get("tags", [])) for lem in lems), len(frms), sum(len(frm.get("pronunciations", [])) for frm in frms), sum(len(frm.get("tags", [])) for frm in frms), # senses len(sens), 
sum(len(sen.get("relations", [])) for sen in sens), sum(len(sen.get("examples", [])) for sen in sens), sum(len(sen.get("counts", [])) for sen in sens), # synsets len(syns), sum(len(syn.get("definitions", [])) for syn in syns), sum(len(syn.get("relations", [])) for syn in syns), sum(len(syn.get("examples", [])) for syn in syns), # syntactic behaviours sum(len(ent.get("frames", [])) for ent in locs), len(lex.get("frames", [])), ] ) def _update_lookup_tables(lexicon: _AnyLexicon, cur: sqlite3.Cursor) -> None: reltypes = { rel["relType"] for ss in _synsets(lexicon) for rel in ss.get("relations", []) } reltypes.update( rel["relType"] for e in _entries(lexicon) for s in _senses(e) for rel in s.get("relations", []) ) cur.executemany( "INSERT OR IGNORE INTO relation_types VALUES (null,?)", [(rt,) for rt in sorted(reltypes)], ) lexfiles: set[str] = { ss.get("lexfile", "") for ss in _local_synsets(_synsets(lexicon)) if ss.get("lexfile") } cur.executemany( "INSERT OR IGNORE INTO lexfiles VALUES (null,?)", [(lf,) for lf in sorted(lexfiles)], ) def _insert_lexicon( lexicon: _AnyLexicon, cur: sqlite3.Cursor, progress: ProgressHandler ) -> tuple[int, int]: progress.set(status="Lexicon Info") cur.execute( "INSERT INTO lexicons VALUES (null,?,?,?,?,?,?,?,?,?,?,?,?)", ( f"{lexicon['id']}:{lexicon['version']}", lexicon["id"], lexicon["label"], lexicon["language"], lexicon["email"], lexicon["license"], lexicon["version"], lexicon.get("url"), lexicon.get("citation"), lexicon.get("logo"), lexicon.get("meta"), False, ), ) lexid = cur.lastrowid if not isinstance(lexid, int): raise Error("failed to insert lexicon") query = """ UPDATE lexicon_dependencies SET provider_rowid = ? WHERE provider_id = ? AND provider_version = ? 
""" cur.execute(query, (lexid, lexicon["id"], lexicon["version"])) query = """ INSERT INTO {table} VALUES (:lid, :id, :version, :url, (SELECT rowid FROM lexicons WHERE id=:id AND version=:version)) """ params = [] for dep in lexicon.get("requires", []): param_dict = dict(dep) param_dict.setdefault("url", None) param_dict["lid"] = lexid params.append(param_dict) if params: cur.executemany(query.format(table="lexicon_dependencies"), params) if lexicon.get("extends"): lexicon = cast("lmf.LexiconExtension", lexicon) param_dict = dict(lexicon["extends"]) param_dict.setdefault("url", None) param_dict["lid"] = lexid cur.execute(query.format(table="lexicon_extensions"), param_dict) baseid = cur.execute( "SELECT rowid FROM lexicons WHERE id=? AND version=?", (param_dict["id"], param_dict["version"]), ).fetchone()[0] else: baseid = lexid return lexid, baseid _LexIdMap = dict[str, int] def _build_lexid_map(lexicon: _AnyLexicon, lexid: int, extid: int) -> _LexIdMap: """Build a mapping of entity IDs to extended lexicon rowid.""" lexidmap: _LexIdMap = {} if lexid != extid: lexidmap.update((e["id"], extid) for e in _entries(lexicon) if _is_external(e)) lexidmap.update( (s["id"], extid) for e in _entries(lexicon) for s in _senses(e) if _is_external(s) ) lexidmap.update( (ss["id"], extid) for ss in _synsets(lexicon) if _is_external(ss) ) return lexidmap T = TypeVar("T") def _batch(sequence: Iterable[T]) -> Iterator[list[T]]: it = iter(sequence) batch = list(islice(it, 0, BATCH_SIZE)) while len(batch): yield batch batch = list(islice(it, 0, BATCH_SIZE)) def _insert_synsets( synsets: Sequence[_AnySynset], lexid: int, cur: sqlite3.Cursor, progress: ProgressHandler, ) -> None: progress.set(status="Synsets") # synsets ss_query = f""" INSERT INTO synsets VALUES (null,?,?,(SELECT rowid FROM ilis WHERE id=?),?,({LEXFILE_QUERY}),?) """ # presupposed ILIs pre_ili_query = f""" INSERT OR IGNORE INTO ilis VALUES (null,?,({ILISTAT_QUERY}),?,?) 
""" # proposed ILIs pro_ili_query = """ INSERT INTO proposed_ilis VALUES (null, (SELECT ss.rowid FROM synsets AS ss WHERE ss.id=? AND lexicon_rowid=?), ?, ?) """ for batch in _batch(_local_synsets(synsets)): # first add presupposed ILIs pre_ili_data = [] for ss in batch: ili = ss["ili"] if ili and ili != "in": defn = ss.get("ili_definition") # normally null text = defn["text"] if defn else None meta = defn.get("meta") if defn else None pre_ili_data.append((ili, "presupposed", text, meta)) cur.executemany(pre_ili_query, pre_ili_data) # then add synsets ss_data = ( ( ss["id"], lexid, ss["ili"] if ss["ili"] and ss["ili"] != "in" else None, ss.get("partOfSpeech"), ss.get("lexfile"), ss.get("meta"), ) for ss in batch ) cur.executemany(ss_query, ss_data) # finally add proposed ILIs pro_ili_data = [] for ss in batch: ili = ss["ili"] if ili == "in": defn = ss.get("ili_definition") text = defn["text"] if defn else None meta = defn.get("meta") if defn else None pro_ili_data.append((ss["id"], lexid, text, meta)) cur.executemany(pro_ili_query, pro_ili_data) progress.update(len(batch)) # only store when lexicalized=False unlexicalized_data = [ (synset["id"], lexid) for synset in _local_synsets(synsets) if not synset.get("lexicalized", True) ] query = f""" INSERT INTO unlexicalized_synsets (synset_rowid) {SYNSET_QUERY} """ cur.executemany(query, unlexicalized_data) def _insert_synset_definitions( synsets: Sequence[_AnySynset], lexid: int, lexidmap: _LexIdMap, cur: sqlite3.Cursor, progress: ProgressHandler, ) -> None: progress.set(status="Definitions") query = f""" INSERT INTO definitions VALUES (null,?,({SYNSET_QUERY}),?,?,({SENSE_QUERY}),?) 
""" for batch in _batch(synsets): data = [ ( lexid, synset["id"], lexidmap.get(synset["id"], lexid), definition["text"], definition.get("language"), definition.get("sourceSense"), lexidmap.get(definition.get("sourceSense", ""), lexid), definition.get("meta"), ) for synset in batch for definition in synset.get("definitions", []) ] cur.executemany(query, data) progress.update(len(data)) def _insert_synset_relations( synsets: Sequence[_AnySynset], lexid: int, lexidmap: _LexIdMap, cur: sqlite3.Cursor, progress: ProgressHandler, ) -> None: progress.set(status="Synset Relations") query = f""" INSERT INTO synset_relations VALUES (null,?,({SYNSET_QUERY}),({SYNSET_QUERY}),({RELTYPE_QUERY}),?) """ for batch in _batch(synsets): data = [ ( lexid, synset["id"], lexidmap.get(synset["id"], lexid), relation["target"], lexidmap.get(relation["target"], lexid), relation["relType"], relation.get("meta"), ) for synset in batch for relation in synset.get("relations", []) ] cur.executemany(query, data) progress.update(len(data)) def _insert_entries( entries: Sequence[_AnyEntry], lexid: int, cur: sqlite3.Cursor, progress: ProgressHandler, ) -> None: progress.set(status="Words") query = "INSERT INTO entries VALUES (null,?,?,?,?)" for batch in _batch(_local_entries(entries)): data = ( (entry["id"], lexid, entry["lemma"]["partOfSpeech"], entry.get("meta")) for entry in batch ) cur.executemany(query, data) progress.update(len(batch)) def _insert_index( entries: Sequence[_AnyEntry], lexid: int, cur: sqlite3.Cursor, progress: ProgressHandler, ) -> None: progress.set(status="Index") query = f"INSERT INTO entry_index VALUES (({ENTRY_QUERY}),?)" for batch in _batch(_local_entries(entries)): data = ( ( entry["id"], lexid, entry["index"], ) for entry in batch if entry.get("index") ) cur.executemany(query, data) progress.update(len(batch)) def _insert_forms( entries: Sequence[_AnyEntry], lexid: int, lexidmap: _LexIdMap, cur: sqlite3.Cursor, progress: ProgressHandler, ) -> None: 
progress.set(status="Word Forms") query = f"INSERT INTO forms VALUES (null,?,?,({ENTRY_QUERY}),?,?,?,?)" for batch in _batch(entries): forms: list[ tuple[str | None, int, str, int, str, str | None, str | None, int] ] = [] for entry in batch: eid = entry["id"] lid = lexidmap.get(eid, lexid) if not _is_external(entry): entry = cast("lmf.LexicalEntry", entry) written_form = entry["lemma"]["writtenForm"] norm = normalize_form(written_form) forms.append( ( None, lexid, eid, lid, written_form, norm if norm != written_form else None, entry["lemma"].get("script"), 0, ) ) for i, form in enumerate(_forms(entry), 1): if _is_external(form): continue form = cast("lmf.Form", form) written_form = form["writtenForm"] norm = normalize_form(written_form) forms.append( ( form.get("id"), lexid, eid, lid, written_form, norm if norm != written_form else None, form.get("script"), i, ) ) cur.executemany(query, forms) progress.update(len(forms)) def _insert_pronunciations( entries: Sequence[_AnyEntry], lexid: int, lexidmap: _LexIdMap, cur: sqlite3.Cursor, progress: ProgressHandler, ) -> None: progress.set(status="Pronunciations") query = f"INSERT INTO pronunciations VALUES (({FORM_QUERY}),?,?,?,?,?,?)" for batch in _batch(entries): prons: list[ tuple[ # FORM_QUERY args str, # entry id int, # entry lexid str | None, # optional form id int, # rank # pronunciation fields int, # pronunciation lexid str, # text str | None, # variety str | None, # notation bool, # phonemic str | None, # audio ] ] = [] for entry in batch: eid = entry["id"] lid = lexidmap.get(eid, lexid) if lemma := entry.get("lemma"): for p in lemma.get("pronunciations", []): prons.append( ( eid, lid, None, 0, lexid, p["text"], p.get("variety"), p.get("notation"), p.get("phonemic", True), p.get("audio"), ) ) for i, form in enumerate(_forms(entry), 1): # rank is not valid in FORM_QUERY for external forms rank = -1 if _is_external(form) else i for p in form.get("pronunciations", []): prons.append( ( eid, lid, form.get("id"), rank, 
lexid, p["text"], p.get("variety"), p.get("notation"), p.get("phonemic", True), p.get("audio"), ) ) cur.executemany(query, prons) progress.update(len(prons)) def _insert_tags( entries: Sequence[_AnyEntry], lexid: int, lexidmap: _LexIdMap, cur: sqlite3.Cursor, progress: ProgressHandler, ) -> None: progress.set(status="Word Form Tags") query = f"INSERT INTO tags VALUES (({FORM_QUERY}),?,?,?)" for batch in _batch(entries): tags: list[tuple[str, int, str | None, int, int, str, str]] = [] for entry in batch: eid = entry["id"] lid = lexidmap.get(eid, lexid) if lemma := entry.get("lemma"): for tag in lemma.get("tags", []): tags.append( ( eid, lid, None, 0, lexid, tag["text"], tag["category"], ) ) for i, form in enumerate(_forms(entry), 1): # rank is not valid in FORM_QUERY for external forms rank = -1 if _is_external(form) else i for tag in form.get("tags", []): tags.append( ( eid, lid, form.get("id"), rank, lexid, tag["text"], tag["category"], ) ) cur.executemany(query, tags) progress.update(len(tags)) def _insert_senses( entries: Sequence[_AnyEntry], synsets: Sequence[_AnySynset], lexid: int, lexidmap: _LexIdMap, cur: sqlite3.Cursor, progress: ProgressHandler, ) -> None: progress.set(status="Senses") ssrank = { (ss["id"], _id): i for ss in _local_synsets(synsets) for i, _id in enumerate(ss.get("members", [])) } query = f""" INSERT INTO senses VALUES (null, ?, ?, ({ENTRY_QUERY}), ?, ({SYNSET_QUERY}), ?, ?) 
""" for batch in _batch(entries): data = [ ( sense["id"], lexid, entry["id"], lexidmap.get(entry["id"], lexid), sense.get("n", i), sense["synset"], lexidmap.get(sense["synset"], lexid), # members can be sense or entry IDs ssrank.get( (sense["synset"], sense["id"]), ssrank.get((sense["synset"], entry["id"]), DEFAULT_MEMBER_RANK), ), sense.get("meta"), ) for entry in batch for i, sense in enumerate(_local_senses(_senses(entry)), 1) ] cur.executemany(query, data) progress.update(len(data)) # only store when lexicalized=False unlexicalized_data = [ (sense["id"], lexid) for entry in entries for sense in _local_senses(_senses(entry)) if not sense.get("lexicalized", True) ] query = f""" INSERT INTO unlexicalized_senses (sense_rowid) {SENSE_QUERY} """ cur.executemany(query, unlexicalized_data) def _insert_adjpositions( entries: Sequence[_AnyEntry], lexid: int, lexidmap: _LexIdMap, cur: sqlite3.Cursor, progress: ProgressHandler, ): progress.set(status="Sense Adjpositions") data = [ (s["id"], lexidmap.get(s["id"], lexid), s["adjposition"]) for e in entries for s in _local_senses(_senses(e)) if s.get("adjposition") ] query = f"INSERT INTO adjpositions VALUES (({SENSE_QUERY}),?)" cur.executemany(query, data) def _insert_counts( entries: Sequence[_AnyEntry], lexid: int, lexidmap: _LexIdMap, cur: sqlite3.Cursor, progress: ProgressHandler, ) -> None: progress.set(status="Counts") data = [ ( lexid, sense["id"], lexidmap.get(sense["id"], lexid), count["value"], count.get("meta"), ) for entry in entries for sense in _senses(entry) for count in sense.get("counts", []) ] query = f"INSERT INTO counts VALUES (null,?,({SENSE_QUERY}),?,?)" cur.executemany(query, data) progress.update(len(data)) def _collect_frames(lexicon: _AnyLexicon) -> list[lmf.SyntacticBehaviour]: # WN-LMF 1.0 syntactic behaviours are on lexical entries, and in # WN-LMF 1.1 they are at the lexticon level with IDs. This # function normalizes the two variants. 
# IDs are not required and frame strings must be unique in a # lexicon, so lookup syntactic behaviours by the frame string synbhrs: dict[str, lmf.SyntacticBehaviour] = { frame["subcategorizationFrame"]: lmf.SyntacticBehaviour( id=frame["id"], subcategorizationFrame=frame["subcategorizationFrame"], senses=frame.get("senses", []), ) for frame in lexicon.get("frames", []) } # all relevant senses are collected into the 'senses' key id_senses_map = {sb["id"]: sb["senses"] for sb in synbhrs.values() if sb.get("id")} for entry in _entries(lexicon): # for WN-LMF 1.1 for sense in _local_senses(_senses(entry)): for sbid in sense.get("subcat", []): id_senses_map[sbid].append(sense["id"]) # for WN-LMF 1.0 if _is_external(entry) or not entry.get("frames"): continue entry = cast("lmf.LexicalEntry", entry) all_senses = [s["id"] for s in _senses(entry)] for frame in entry.get("frames", []): subcat_frame = frame["subcategorizationFrame"] if subcat_frame not in synbhrs: synbhrs[subcat_frame] = lmf.SyntacticBehaviour( subcategorizationFrame=subcat_frame, senses=[], ) senses = frame.get("senses", []) or all_senses synbhrs[subcat_frame]["senses"].extend(senses) return list(synbhrs.values()) def _insert_syntactic_behaviours( synbhrs: Sequence[lmf.SyntacticBehaviour], lexid: int, lexidmap: _LexIdMap, cur: sqlite3.Cursor, progress: ProgressHandler, ) -> None: progress.set(status="Syntactic Behaviours") query = "INSERT INTO syntactic_behaviours VALUES (null,?,?,?)" sbdata = [ (sb.get("id") or None, lexid, sb["subcategorizationFrame"]) for sb in synbhrs ] cur.executemany(query, sbdata) # syntactic behaviours don't have a required ID; index on frame framemap: dict[str, list[str]] = { sb["subcategorizationFrame"]: sb.get("senses", []) for sb in synbhrs } query = f""" INSERT INTO syntactic_behaviour_senses VALUES ((SELECT rowid FROM syntactic_behaviours WHERE lexicon_rowid=? 
AND frame=?), ({SENSE_QUERY})) """ sbsdata = [ (lexid, frame, sid, lexidmap.get(sid, lexid)) for frame in framemap for sid in framemap[frame] ] cur.executemany(query, sbsdata) progress.update(len(synbhrs)) def _insert_sense_relations( lexicon: _AnyLexicon, lexid: int, lexidmap: _LexIdMap, cur: sqlite3.Cursor, progress: ProgressHandler, ) -> None: progress.set(status="Sense Relations") # need to separate relations into those targeting senses vs synsets synset_ids = {ss["id"] for ss in _synsets(lexicon)} sense_ids = {s["id"] for e in _entries(lexicon) for s in _senses(e)} s_s_rels = [] s_ss_rels = [] for entry in _entries(lexicon): for sense in _senses(entry): slid = lexidmap.get(sense["id"], lexid) for relation in sense.get("relations", []): target_id = relation["target"] tlid = lexidmap.get(target_id, lexid) if target_id in sense_ids: s_s_rels.append((sense["id"], slid, tlid, relation)) elif target_id in synset_ids: s_ss_rels.append((sense["id"], slid, tlid, relation)) else: raise Error( f"relation target is not a known sense or synset: {target_id}" ) hyperparams = [ ("sense_relations", SENSE_QUERY, s_s_rels), ("sense_synset_relations", SYNSET_QUERY, s_ss_rels), ] for table, target_query, rels in hyperparams: query = f""" INSERT INTO {table} VALUES (null,?,({SENSE_QUERY}),({target_query}),({RELTYPE_QUERY}),?) 
""" for batch in _batch(rels): data = [ ( lexid, sense_id, slid, relation["target"], tlid, relation["relType"], relation.get("meta"), ) for sense_id, slid, tlid, relation in batch ] cur.executemany(query, data) progress.update(len(data)) def _insert_examples( objs: Sequence[lmf.Sense | lmf.ExternalSense | lmf.Synset | lmf.ExternalSynset], lexid: int, lexidmap: _LexIdMap, table: str, cur: sqlite3.Cursor, progress: ProgressHandler, ) -> None: progress.set(status="Examples") if table == "sense_examples": query = f"INSERT INTO {table} VALUES (null,?,({SENSE_QUERY}),?,?,?)" else: query = f"INSERT INTO {table} VALUES (null,?,({SYNSET_QUERY}),?,?,?)" for batch in _batch(objs): data = [ ( lexid, obj["id"], lexidmap.get(obj["id"], lexid), example["text"], example.get("language"), example.get("meta"), ) for obj in batch for example in obj.get("examples", []) ] # be careful of SQL injection here cur.executemany(query, data) progress.update(len(data)) def _add_ili( source: Path, progress: ProgressHandler, ) -> None: query = f""" INSERT INTO ilis VALUES (null,?,({ILISTAT_QUERY}),?,null) ON CONFLICT(id) DO UPDATE SET status_rowid=excluded.status_rowid, definition=excluded.definition """ with connect() as conn: cur = conn.cursor() progress.flash(f"Reading ILI file: {source!s}") ili = list(_ili.load_tsv(source)) progress.flash("Updating ILI Status Names") statuses = {info.get("status", "active") for info in ili} cur.executemany( "INSERT OR IGNORE INTO ili_statuses VALUES (null,?)", [(stat,) for stat in sorted(statuses)], ) progress.set(count=0, total=len(ili), status="ILI") for batch in _batch(ili): data = [ (info["ili"], info.get("status", "active"), info.get("definition")) for info in batch ] cur.executemany(query, data) progress.update(len(data)) def remove(lexicon: str, progress_handler: type[ProgressHandler] = ProgressBar) -> None: """Remove lexicon(s) from the database. The *lexicon* argument is a :ref:`lexicon specifier `. 
Note that this removes a lexicon and not a project, so the lexicons of projects containing multiple lexicons will need to be removed individually or, if applicable, a star specifier. The *progress_handler* parameter takes a subclass of :class:`wn.util.ProgressHandler`. An instance of the class will be created, used, and closed by this function. >>> wn.remove("ewn:2019") # removes a single lexicon >>> wn.remove("*:1.3+omw") # removes all lexicons with version 1.3+omw """ if progress_handler is None: progress_handler = ProgressHandler progress = progress_handler(message="Removing", unit="\be5 operations") conn = connect() conn.set_progress_handler(progress.update, 100000) try: for lexspec in resolve_lexicon_specifiers(lexicon=lexicon): extensions = get_lexicon_extensions(lexspec) with conn: for ext_spec in reversed(extensions): progress.set(status=f"{ext_spec} (extension)") conn.execute( "DELETE FROM lexicons WHERE specifier = ?", (ext_spec,), ) progress.flash(f"Removed {ext_spec}\n") extra = f" (and {len(extensions)} extension(s))" if extensions else "" progress.set(status=f"{lexspec}", count=0) conn.execute( "DELETE FROM lexicons WHERE specifier = ?", (lexspec,), ) progress.flash(f"Removed {lexspec}{extra}\n") finally: progress.close() conn.set_progress_handler(None, 0) def _entries(lex: _AnyLexicon) -> Sequence[_AnyEntry]: return lex.get("entries", []) def _forms(e: _AnyEntry) -> Sequence[_AnyForm]: return e.get("forms", []) def _senses(e: _AnyEntry) -> Sequence[_AnySense]: return e.get("senses", []) def _synsets(lex: _AnyLexicon) -> Sequence[_AnySynset]: return lex.get("synsets", []) def _is_external(x: _AnyForm | _AnyLemma | _AnyEntry | _AnySense | _AnySynset) -> bool: return x.get("external", False) is True def _local_synsets(synsets: Sequence[_AnySynset]) -> Iterator[lmf.Synset]: for ss in synsets: if _is_external(ss): continue yield cast("lmf.Synset", ss) def _local_entries(entries: Sequence[_AnyEntry]) -> Iterator[lmf.LexicalEntry]: for e in entries: if 
_is_external(e): continue yield cast("lmf.LexicalEntry", e) def _local_senses(senses: Sequence[_AnySense]) -> Iterator[lmf.Sense]: for s in senses: if _is_external(s): continue yield cast("lmf.Sense", s) wn-1.0.0/wn/_config.py000066400000000000000000000234511513755206300145520ustar00rootroot00000000000000""" Local configuration settings. """ from collections.abc import Sequence from importlib.resources import as_file, files from pathlib import Path from typing import Any try: # python_version >= 3.11 import tomllib # type: ignore except ImportError: import tomli as tomllib # type: ignore from wn._exceptions import ConfigurationError, ProjectError from wn._types import AnyPath from wn._util import format_lexicon_specifier, short_hash, split_lexicon_specifier from wn.constants import _WORDNET # The index file is a project file of Wn with as_file(files("wn") / "index.toml") as index_file: INDEX_FILE_PATH = index_file # The directory where downloaded and added data will be stored. DEFAULT_DATA_DIRECTORY = Path.home() / ".wn_data" DATABASE_FILENAME = "wn.db" class WNConfig: def __init__(self): self._data_directory = DEFAULT_DATA_DIRECTORY self._projects = {} self._dbpath = self._data_directory / DATABASE_FILENAME self.allow_multithreading = False @property def data_directory(self) -> Path: """The file system directory where Wn's data is stored. Assign a new path to change where the database and downloads are stored. 
>>> wn.config.data_directory = "~/.cache/wn" >>> wn.config.database_path PosixPath('/home/username/.cache/wn/wn.db') >>> wn.config.downloads_directory PosixPath('/home/username/.cache/wn/downloads') """ dir = self._data_directory dir.mkdir(exist_ok=True) return dir @data_directory.setter def data_directory(self, path): dir = Path(path).expanduser() if dir.exists() and not dir.is_dir(): raise ConfigurationError(f"path exists and is not a directory: {dir}") self._data_directory = dir self._dbpath = dir / DATABASE_FILENAME @property def database_path(self) -> Path: """The path to the database file. The database path is derived from :attr:`data_directory` and cannot be changed directly. """ return self._dbpath @property def downloads_directory(self) -> Path: """The file system directory where downloads are cached. The downloads directory is derived from :attr:`data_directory` and cannot be changed directly. """ dir = self.data_directory / "downloads" dir.mkdir(exist_ok=True) return dir @property def index(self) -> dict[str, dict]: """The project index.""" return self._projects def add_project( self, id: str, type: str = _WORDNET, label: str | None = None, language: str | None = None, license: str | None = None, error: str | None = None, ) -> None: """Add a new wordnet project to the index. Arguments: id: short identifier of the project type: project type (default 'wordnet') label: full name of the project language: `BCP 47`_ language code of the resource license: link or name of the project's default license error: if set, the error message to use when the project is accessed .. 
_BCP 47: https://en.wikipedia.org/wiki/IETF_language_tag """ if id in self._projects: raise ValueError(f"project already added: {id}") self._projects[id] = { "type": type, "label": label, "language": language, "versions": {}, "license": license, } if error: self._projects[id]["error"] = error def add_project_version( self, id: str, version: str, url: str | None = None, error: str | None = None, license: str | None = None, ) -> None: """Add a new resource version for a project. Exactly one of *url* or *error* must be specified. Arguments: id: short identifier of the project version: version string of the resource url: space-separated list of web addresses for the resource license: link or name of the resource's license; if not given, the project's default license will be used. error: if set, the error message to use when the project is accessed """ version_data: dict[str, Any] if url and not error: version_data = {"resource_urls": url.split()} elif error and not url: version_data = {"error": error} elif url and error: spec = format_lexicon_specifier(id, version) raise ConfigurationError(f"{spec} specifies both url and redirect") else: version_data = {} if license: version_data["license"] = license project = self._projects[id] project["versions"][version] = version_data def get_project_info(self, arg: str) -> dict: """Return information about an indexed project version. If the project has been downloaded and cached, the ``"cache"`` key will point to the path of the cached file, otherwise its value is ``None``. 
Arguments: arg: a project specifier Example: >>> info = wn.config.get_project_info("oewn:2021") >>> info["label"] 'Open English WordNet' """ id, version = split_lexicon_specifier(arg) if id not in self._projects: raise ProjectError(f"no such project id: {id}") project: dict = self._projects[id] if "error" in project: raise ProjectError(project["error"]) versions: dict = project["versions"] if not version or version == "*": version = next(iter(versions), "") if not version: raise ProjectError(f"no versions available for {id}") elif version not in versions: raise ProjectError(f"no such version: {version!r} ({id})") info = versions[version] if "error" in info: raise ProjectError(info["error"]) urls = info.get("resource_urls", []) return { "id": id, "version": version, "type": project["type"], "label": project["label"], "language": project["language"], "license": info.get("license", project.get("license")), "resource_urls": urls, "cache": _get_cache_path_for_urls(self, urls), } def get_cache_path(self, url: str) -> Path: """Return the path for caching *url*. Note that in general this is just a path operation and does not signify that the file exists in the file system. """ filename = short_hash(url) return self.downloads_directory / filename def update(self, data: dict) -> None: """Update the configuration with items in *data*. Items are only inserted or replaced, not deleted. If a project index is provided in the ``"index"`` key, then either the project must not already be indexed or any project fields (label, language, or license) that are specified must be equal to the indexed project. 
""" if "data_directory" in data: self.data_directory = data["data_directory"] for id, project in data.get("index", {}).items(): if id in self._projects: # validate that they are the same _project = self._projects[id] for attr in ("label", "language", "license"): if attr in project and project[attr] != _project[attr]: raise ConfigurationError(f"{attr} mismatch for {id}") else: self.add_project( id, type=project.get("type", _WORDNET), label=project.get("label"), language=project.get("language"), license=project.get("license"), error=project.get("error"), ) for version, info in project.get("versions", {}).items(): if "url" in info and "error" in project: spec = format_lexicon_specifier(id, version) raise ConfigurationError(f"{spec} url specified with default error") self.add_project_version( id, version, url=info.get("url"), license=info.get("license"), error=info.get("error"), ) def load_index(self, path: AnyPath) -> None: """Load and update with the project index at *path*. The project index is a TOML_ file containing project and version information. For example: .. code-block:: toml [ewn] label = "Open English WordNet" language = "en" license = "https://creativecommons.org/licenses/by/4.0/" [ewn.versions.2019] url = "https://en-word.net/static/english-wordnet-2019.xml.gz" [ewn.versions.2020] url = "https://en-word.net/static/english-wordnet-2020.xml.gz" .. 
_TOML: https://toml.io """ path = Path(path).expanduser() with path.open("rb") as indexfile: try: index = tomllib.load(indexfile) except tomllib.TOMLDecodeError as exc: raise ConfigurationError("malformed index file") from exc self.update({"index": index}) def _get_cache_path_for_urls( config: WNConfig, urls: Sequence[str], ) -> Path | None: for url in urls: path = config.get_cache_path(url) if path.is_file(): return path return None config = WNConfig() config.load_index(INDEX_FILE_PATH) wn-1.0.0/wn/_core.py000066400000000000000000001171711513755206300142400ustar00rootroot00000000000000from __future__ import annotations import enum from dataclasses import dataclass, field from typing import TYPE_CHECKING, Literal, TypeVar, overload from wn import taxonomy from wn._lexicon import ( LexiconConfiguration, LexiconElement, LexiconElementWithMetadata, ) from wn._queries import Pronunciation as PronunciationTuple from wn._queries import Tag as TagTuple from wn._queries import ( find_entries, find_synsets, get_adjposition, get_definitions, get_entry_forms, get_entry_senses, get_examples, get_expanded_synset_relations, get_lexfile, get_lexicalized, get_lexicon_extension_bases, get_lexicon_extensions, get_metadata, get_sense_counts, get_sense_relations, get_sense_synset_relations, get_synset_members, get_synset_relations, get_synsets_for_ilis, get_syntactic_behaviours, resolve_lexicon_specifiers, ) from wn._util import unique_list if TYPE_CHECKING: from collections.abc import Iterator, Sequence from wn._metadata import Metadata _INFERRED_SYNSET = "*INFERRED*" class _EntityType(str, enum.Enum): """Identifies the database table of an entity.""" LEXICONS = "lexicons" ENTRIES = "entries" SENSES = "senses" SYNSETS = "synsets" SENSE_RELATIONS = "sense_relations" SENSE_SYNSET_RELATIONS = "sense_synset_relations" SYNSET_RELATIONS = "synset_relations" UNSET = "" _EMPTY_LEXCONFIG = LexiconConfiguration( lexicons=(), expands=(), default_mode=False, ) class 
_LexiconDataElement(LexiconElementWithMetadata): """Base class for Words, Senses, and Synsets. These elements always have a required ID and are used as the starting point of secondary queries, so they also store the configuration of lexicons used in the original query. """ __slots__ = "_lexconf", "id" id: str _lexconf: LexiconConfiguration def __init__( self, id: str, _lexicon: str = "", _lexconf: LexiconConfiguration = _EMPTY_LEXCONFIG, ) -> None: self.id = id self._lexicon = _lexicon self._lexconf = _lexconf def __eq__(self, other) -> bool: if isinstance(other, type(self)) or isinstance(self, type(other)): return self.id == other.id and self._lexicon == other._lexicon return NotImplemented def __hash__(self) -> int: return hash((self.id, self._lexicon)) def _get_lexicons(self) -> tuple[str, ...]: if self._lexconf.default_mode: return ( self._lexicon, *get_lexicon_extension_bases(self._lexicon), *get_lexicon_extensions(self._lexicon), ) else: return self._lexconf.lexicons @dataclass(frozen=True, slots=True) class Pronunciation(LexiconElement): """A class for word form pronunciations.""" __module__ = "wn" value: str variety: str | None = None notation: str | None = None phonemic: bool = True audio: str | None = None _lexicon: str = field(default="", repr=False, compare=False) @dataclass(frozen=True, slots=True) class Tag(LexiconElement): """A general-purpose tag class for word forms.""" __module__ = "wn" tag: str category: str _lexicon: str = field(default="", repr=False, compare=False) @dataclass(frozen=True, slots=True) class Form(LexiconElement): """A word-form.""" __module__ = "wn" value: str id: str | None = field(default=None, repr=False, compare=False) script: str | None = field(default=None, repr=False) _lexicon: str = field(default="", repr=False, compare=False) _pronunciations: tuple[Pronunciation, ...] = field( default_factory=tuple, repr=False, compare=False ) _tags: tuple[Tag, ...] 
= field(default_factory=tuple, repr=False, compare=False) def pronunciations(self) -> list[Pronunciation]: return list(self._pronunciations) def tags(self) -> list[Tag]: return list(self._tags) def _make_form( form: str, id: str | None, script: str | None, lexicon: str, prons: list[PronunciationTuple], tags: list[TagTuple], ) -> Form: return Form( form, id=id, script=script, _lexicon=lexicon, _pronunciations=tuple(Pronunciation(*data) for data in prons), _tags=tuple(Tag(*data) for data in tags), ) class Word(_LexiconDataElement): """A class for words (also called lexical entries) in a wordnet.""" __slots__ = ("pos",) __module__ = "wn" _ENTITY_TYPE = _EntityType.ENTRIES pos: str def __init__( self, id: str, pos: str, _lexicon: str = "", _lexconf: LexiconConfiguration = _EMPTY_LEXCONFIG, ): super().__init__(id=id, _lexicon=_lexicon, _lexconf=_lexconf) self.pos = pos def __repr__(self) -> str: return f"Word({self.id!r})" @overload def lemma(self, *, data: Literal[False] = False) -> str: ... @overload def lemma(self, *, data: Literal[True] = True) -> Form: ... # fallback for non-literal bool argument @overload def lemma(self, *, data: bool) -> str | Form: ... def lemma(self, *, data: bool = False) -> str | Form: """Return the canonical form of the word. If the *data* argument is :python:`False` (the default), the lemma is returned as a :class:`str` type. If it is :python:`True`, a :class:`wn.Form` object is used instead. Example: >>> wn.words("wolves")[0].lemma() 'wolf' >>> wn.words("wolves")[0].lemma(data=True) Form(value='wolf') """ lexicons = self._get_lexicons() lemma_data = next(get_entry_forms(self.id, lexicons)) if data: return _make_form(*lemma_data) else: return lemma_data[0] @overload def forms(self, *, data: Literal[False] = False) -> list[str]: ... @overload def forms(self, *, data: Literal[True] = True) -> list[Form]: ... # fallback for non-literal bool argument @overload def forms(self, *, data: bool) -> list[str] | list[Form]: ... 
def forms(self, *, data: bool = False) -> list[str] | list[Form]: """Return the list of all encoded forms of the word. If the *data* argument is :python:`False` (the default), the forms are returned as :class:`str` types. If it is :python:`True`, :class:`wn.Form` objects are used instead. Example: >>> wn.words("wolf")[0].forms() ['wolf', 'wolves'] >>> wn.words("wolf")[0].forms(data=True) [Form(value='wolf'), Form(value='wolves')] """ lexicons = self._get_lexicons() form_data = list(get_entry_forms(self.id, lexicons)) if data: return [_make_form(*data) for data in form_data] else: return [form for form, *_ in form_data] def senses(self) -> list[Sense]: """Return the list of senses of the word. Example: >>> wn.words("zygoma")[0].senses() [Sense('ewn-zygoma-n-05292350-01')] """ lexicons = self._get_lexicons() iterable = get_entry_senses(self.id, lexicons) return [Sense(*sense_data, _lexconf=self._lexconf) for sense_data in iterable] def metadata(self) -> Metadata: """Return the word's metadata.""" return get_metadata(self.id, self._lexicon, "entries") def synsets(self) -> list[Synset]: """Return the list of synsets of the word. Example: >>> wn.words("addendum")[0].synsets() [Synset('ewn-06411274-n')] """ return [sense.synset() for sense in self.senses()] def derived_words(self) -> list[Word]: """Return the list of words linked through derivations on the senses. Example: >>> wn.words("magical")[0].derived_words() [Word('ewn-magic-n'), Word('ewn-magic-n')] """ return [ derived_sense.word() for sense in self.senses() for derived_sense in sense.get_related("derivation") ] def translate( self, lexicon: str | None = None, *, lang: str | None = None, ) -> dict[Sense, list[Word]]: """Return a mapping of word senses to lists of translated words. Arguments: lexicon: lexicon specifier of translated words lang: BCP-47 language code of translated words Example: >>> w = wn.words("water bottle", pos="n")[0] >>> for sense, words in w.translate(lang="ja").items(): ... 
print(sense, [jw.lemma() for jw in words]) Sense('ewn-water_bottle-n-04564934-01') ['水筒'] """ result = {} for sense in self.senses(): result[sense] = [ t_sense.word() for t_sense in sense.translate(lang=lang, lexicon=lexicon) ] return result class Relation(LexiconElementWithMetadata): """A class to model relations between senses or synsets.""" __slots__ = "_lexicon", "_metadata", "name", "source_id", "target_id" __module__ = "wn" name: str source_id: str target_id: str _metadata: Metadata | None def __init__( self, name: str, source_id: str, target_id: str, lexicon: str, *, metadata: Metadata | None = None, ): self.name = name self.source_id = source_id self.target_id = target_id self._lexicon = lexicon self._metadata = metadata def __repr__(self) -> str: return ( self.__class__.__name__ + f"({self.name!r}, {self.source_id!r}, {self.target_id!r})" ) def __eq__(self, other) -> bool: if not isinstance(other, Relation): return NotImplemented return ( self.name == other.name and self.source_id == other.source_id and self.target_id == other.target_id and self._lexicon == other._lexicon and self.subtype == other.subtype ) def __hash__(self) -> int: datum = self.name, self.source_id, self.target_id, self._lexicon, self.subtype return hash(datum) @property def subtype(self) -> str | None: """ The value of the ``dc:type`` metadata. If ``dc:type`` is not specified in the metadata, ``None`` is returned instead. """ return self.metadata().get("type") T = TypeVar("T", bound="_Relatable") class _Relatable(_LexiconDataElement): @overload def relations( self: T, *args: str, data: Literal[False] = False ) -> dict[str, list[T]]: ... @overload def relations( self: T, *args: str, data: Literal[True] = True ) -> dict[Relation, T]: ... # fallback for non-literal bool argument @overload def relations( self: T, *args: str, data: bool = False ) -> dict[str, list[T]] | dict[Relation, T]: ... 
def relations( self: T, *args: str, data: bool = False ) -> dict[str, list[T]] | dict[Relation, T]: raise NotImplementedError def get_related(self: T, *args: str) -> list[T]: raise NotImplementedError def closure(self: T, *args: str) -> Iterator[T]: visited = set() queue = self.get_related(*args) while queue: relatable = queue.pop(0) if relatable.id not in visited: visited.add(relatable.id) yield relatable queue.extend(relatable.get_related(*args)) def relation_paths(self: T, *args: str, end: T | None = None) -> Iterator[list[T]]: agenda: list[tuple[list[T], set[T]]] = [ ([target], {self, target}) for target in self.get_related(*args) if target != self # avoid self loops? ] while agenda: path, visited = agenda.pop() if end is not None and path[-1] == end: yield path else: related = [ target for target in path[-1].get_related(*args) if target not in visited ] if related: for synset in reversed(related): new_path = [*path, synset] new_visited = visited | {synset} agenda.append((new_path, new_visited)) elif end is None: yield path @dataclass(frozen=True, slots=True) class Example(LexiconElementWithMetadata): """Class for modeling Sense and Synset examples.""" __module__ = "wn" text: str language: str | None = None _lexicon: str = "" _metadata: Metadata | None = field(default=None, repr=False, compare=False) def metadata(self) -> Metadata: """Return the example's metadata.""" return self._metadata if self._metadata is not None else {} @dataclass(frozen=True, slots=True) class Definition(LexiconElementWithMetadata): """Class for modeling Synset definitions.""" __module__ = "wn" text: str language: str | None = None source_sense_id: str | None = field(default=None, compare=False) _lexicon: str = "" _metadata: Metadata | None = field(default=None, compare=False, repr=False) def metadata(self) -> Metadata: """Return the example's metadata.""" return self._metadata if self._metadata is not None else {} class Synset(_Relatable): """Class for modeling wordnet synsets.""" 
__slots__ = "_ili", "pos" __module__ = "wn" _ENTITY_TYPE = _EntityType.SYNSETS pos: str _ili: str | None def __init__( self, id: str, pos: str, ili: str | None = None, _lexicon: str = "", _lexconf: LexiconConfiguration = _EMPTY_LEXCONFIG, ): super().__init__(id=id, _lexicon=_lexicon, _lexconf=_lexconf) self.pos = pos self._ili = ili @classmethod def empty( cls, id: str, ili: str | None = None, _lexicon: str = "", _lexconf: LexiconConfiguration = _EMPTY_LEXCONFIG, ): return cls(id, pos="", ili=ili, _lexicon=_lexicon, _lexconf=_lexconf) def __eq__(self, other) -> bool: # include ili in the hash so inferred synsets don't hash the same if isinstance(other, Synset): return ( self.id == other.id and self._ili == other._ili and self._lexicon == other._lexicon ) return NotImplemented def __hash__(self) -> int: return hash((self.id, self._ili, self._lexicon)) def __repr__(self) -> str: return f"Synset({self.id!r})" @property def ili(self) -> str | None: return self._ili @overload def definition(self, *, data: Literal[False] = False) -> str | None: ... @overload def definition(self, *, data: Literal[True] = True) -> Definition | None: ... # fallback for non-literal bool argument @overload def definition(self, *, data: bool) -> str | Definition | None: ... def definition(self, *, data: bool = False) -> str | Definition | None: """Return the first definition found for the synset. If the *data* argument is :python:`False` (the default), the definition is returned as a :class:`str` type. If it is :python:`True`, a :class:`wn.Definition` object is used instead. 
Example: >>> wn.synsets("cartwheel", pos="n")[0].definition() 'a wheel that has wooden spokes and a metal rim' >>> wn.synsets("cartwheel", pos="n")[0].definition(data=True) [Definition(text='a wheel that has wooden spokes and a metal rim', language=None, source_sense_id=None)] """ lexicons = self._get_lexicons() if defns := get_definitions(self.id, lexicons): text, lang, sense_id, lex, meta = defns[0] if data: return Definition( text, language=lang, source_sense_id=sense_id, _lexicon=lex, _metadata=meta, ) else: return text return None @overload def definitions(self, *, data: Literal[False] = False) -> list[str]: ... @overload def definitions(self, *, data: Literal[True] = True) -> list[Definition]: ... # fallback for non-literal bool argument @overload def definitions(self, *, data: bool) -> list[str] | list[Definition]: ... def definitions(self, *, data: bool = False) -> list[str] | list[Definition]: """Return the list of definitions for the synset. If the *data* argument is :python:`False` (the default), the definitions are returned as :class:`str` objects. If it is :python:`True`, :class:`wn.Definition` objects are used instead. Example: >>> wn.synsets("tea", pos="n")[0].definitions() ['a beverage made by steeping tea leaves in water'] >>> wn.synsets("tea", pos="n")[0].definitions(data=True) [Definition(text='a beverage made by steeping tea leaves in water', language=None, source_sense_id=None)] """ lexicons = self._get_lexicons() defns = get_definitions(self.id, lexicons) if data: return [ Definition( text, language=lang, source_sense_id=sid, _lexicon=lex, _metadata=meta, ) for text, lang, sid, lex, meta in defns ] else: return [text for text, *_ in defns] @overload def examples(self, *, data: Literal[False] = False) -> list[str]: ... @overload def examples(self, *, data: Literal[True] = True) -> list[Example]: ... # fallback for non-literal bool argument @overload def examples(self, *, data: bool) -> list[str] | list[Example]: ... 
def examples(self, *, data: bool = False) -> list[str] | list[Example]: """Return the list of examples for the synset. If the *data* argument is :python:`False` (the default), the examples are returned as :class:`str` types. If it is :python:`True`, :class:`wn.Example` objects are used instead. Example: >>> wn.synsets("orbital", pos="a")[0].examples() ['"orbital revolution"', '"orbital velocity"'] """ lexicons = self._get_lexicons() exs = get_examples(self.id, "synsets", lexicons) if data: return [ Example(text, language=lang, _lexicon=lex, _metadata=meta) for text, lang, lex, meta in exs ] else: return [text for text, *_ in exs] def senses(self) -> list[Sense]: """Return the list of sense members of the synset. Example: >>> wn.synsets("umbrella", pos="n")[0].senses() [Sense('ewn-umbrella-n-04514450-01')] """ lexicons = self._get_lexicons() iterable = get_synset_members(self.id, lexicons) return [Sense(*sense_data, _lexconf=self._lexconf) for sense_data in iterable] def lexicalized(self) -> bool: """Return True if the synset is lexicalized.""" return get_lexicalized(self.id, self._lexicon, "synsets") def lexfile(self) -> str | None: """Return the lexicographer file name for this synset, if any.""" return get_lexfile(self.id, self._lexicon) def metadata(self) -> Metadata: """Return the synset's metadata.""" return get_metadata(self.id, self._lexicon, "synsets") def words(self) -> list[Word]: """Return the list of words linked by the synset's senses. Example: >>> wn.synsets("exclusive", pos="n")[0].words() [Word('ewn-scoop-n'), Word('ewn-exclusive-n')] """ return [sense.word() for sense in self.senses()] @overload def lemmas(self, *, data: Literal[False] = False) -> list[str]: ... @overload def lemmas(self, *, data: Literal[True] = True) -> list[Form]: ... # fallback for non-literal bool argument @overload def lemmas(self, *, data: bool) -> list[str] | list[Form]: ... 
def lemmas(self, *, data: bool = False) -> list[str] | list[Form]: """Return the list of lemmas of words for the synset. If the *data* argument is :python:`False` (the default), the lemmas are returned as :class:`str` types. If it is :python:`True`, :class:`wn.Form` objects are used instead. Example: >>> wn.synsets("exclusive", pos="n")[0].lemmas() ['scoop', 'exclusive'] >>> wn.synsets("exclusive", pos="n")[0].lemmas(data=True) [Form(value='scoop'), Form(value='exclusive')] """ # exploded instead of data=data due to mypy issue # https://github.com/python/mypy/issues/14764 if data: return [w.lemma(data=True) for w in self.words()] else: return [w.lemma(data=False) for w in self.words()] @overload def relations( self, *args: str, data: Literal[False] = False ) -> dict[str, list[Synset]]: ... @overload def relations( self, *args: str, data: Literal[True] = True ) -> dict[Relation, Synset]: ... # fallback for non-literal bool argument @overload def relations( self, *args: str, data: bool = False ) -> dict[str, list[Synset]] | dict[Relation, Synset]: ... def relations( self, *args: str, data: bool = False ) -> dict[str, list[Synset]] | dict[Relation, Synset]: """Return a mapping of synset relations. One or more relation names may be given as positional arguments to restrict the relations returned. If no such arguments are given, all relations starting from the synset are returned. If the *data* argument is :python:`False` (default), the returned object maps from the relation name (a :class:`str`) to a list of :class:`Synset` objects. If *data* is :python:`True`, it instead maps from a :class:`Relation` to a single :class:`Synset`. See :meth:`get_related` for getting a flat list of related synsets. Example: >>> button_rels = wn.synsets("button")[0].relations() >>> for relname, sslist in button_rels.items(): ... 
print(relname, [ss.lemmas() for ss in sslist]) hypernym [['fixing', 'holdfast', 'fastener', 'fastening']] hyponym [['coat button'], ['shirt button']] """ if data: return dict(self._iter_relations()) else: # inner dict is used as an order-preserving set relmap: dict[str, dict[Synset, bool]] = {} for relation, synset in self._iter_relations(*args): relmap.setdefault(relation.name, {})[synset] = True # now convert inner dicts to lists return {relname: list(ss_dict) for relname, ss_dict in relmap.items()} def get_related(self, *args: str) -> list[Synset]: """Return the list of related synsets. One or more relation names may be given as positional arguments to restrict the relations returned. If no such arguments are given, all relations starting from the synset are returned. This method does not preserve the relation names that lead to the related synsets. For a mapping of relation names to related synsets, see :meth:`relations`. Example: >>> fulcrum = wn.synsets("fulcrum")[0] >>> [ss.lemmas() for ss in fulcrum.get_related()] [['pin', 'pivot'], ['lever']] """ return unique_list(synset for _, synset in self._iter_relations(*args)) def _iter_relations(self, *args: str) -> Iterator[tuple[Relation, Synset]]: # first get relations from the current lexicon(s) yield from self._iter_local_relations(args) # then attempt to expand via ILI if self._ili is not None and self._lexconf.expands: yield from self._iter_expanded_relations(args) def _iter_local_relations( self, args: Sequence[str], ) -> Iterator[tuple[Relation, Synset]]: _lexconf = self._lexconf lexicons = self._get_lexicons() iterable = get_synset_relations( self.id, self._lexicon, args, lexicons, lexicons ) for relname, rellex, metadata, _, ssid, pos, ili, tgtlex in iterable: synset_rel = Relation(relname, self.id, ssid, rellex, metadata=metadata) synset = Synset( ssid, pos, ili, _lexicon=tgtlex, _lexconf=_lexconf, ) yield synset_rel, synset def _iter_expanded_relations( self, args: Sequence[str], ) -> 
Iterator[tuple[Relation, Synset]]: assert self._ili is not None, "cannot get expanded relations without an ILI" _lexconf = self._lexconf lexicons = self._get_lexicons() iterable = get_expanded_synset_relations(self._ili, args, _lexconf.expands) for relname, lexicon, metadata, srcid, ssid, _, ili, *_ in iterable: if ili is None: continue synset_rel = Relation(relname, srcid, ssid, lexicon, metadata=metadata) local_ss_rows = list(get_synsets_for_ilis([ili], lexicons=lexicons)) if local_ss_rows: for row in local_ss_rows: yield synset_rel, Synset(*row, _lexconf=_lexconf) else: synset = Synset.empty( id=_INFERRED_SYNSET, ili=ili, _lexicon=self._lexicon, _lexconf=_lexconf, ) yield synset_rel, synset def hypernym_paths(self, simulate_root: bool = False) -> list[list[Synset]]: """Return the list of hypernym paths to a root synset.""" return taxonomy.hypernym_paths(self, simulate_root=simulate_root) def min_depth(self, simulate_root: bool = False) -> int: """Return the minimum taxonomy depth of the synset.""" return taxonomy.min_depth(self, simulate_root=simulate_root) def max_depth(self, simulate_root: bool = False) -> int: """Return the maximum taxonomy depth of the synset.""" return taxonomy.max_depth(self, simulate_root=simulate_root) def shortest_path(self, other: Synset, simulate_root: bool = False) -> list[Synset]: """Return the shortest path from the synset to the *other* synset.""" return taxonomy.shortest_path(self, other, simulate_root=simulate_root) def common_hypernyms( self, other: Synset, simulate_root: bool = False ) -> list[Synset]: """Return the common hypernyms for the current and *other* synsets.""" return taxonomy.common_hypernyms(self, other, simulate_root=simulate_root) def lowest_common_hypernyms( self, other: Synset, simulate_root: bool = False ) -> list[Synset]: """Return the common hypernyms furthest from the root.""" return taxonomy.lowest_common_hypernyms( self, other, simulate_root=simulate_root ) def holonyms(self) -> list[Synset]: """Return 
the list of synsets related by any holonym relation. Any of the following relations are traversed: ``holonym``, ``holo_location``, ``holo_member``, ``holo_part``, ``holo_portion``, ``holo_substance``. """ return self.get_related( "holonym", "holo_location", "holo_member", "holo_part", "holo_portion", "holo_substance", ) def meronyms(self) -> list[Synset]: """Return the list of synsets related by any meronym relation. Any of the following relations are traversed: ``meronym``, ``mero_location``, ``mero_member``, ``mero_part``, ``mero_portion``, ``mero_substance``. """ return self.get_related( "meronym", "mero_location", "mero_member", "mero_part", "mero_portion", "mero_substance", ) def hypernyms(self) -> list[Synset]: """Return the list of synsets related by any hypernym relation. Both the ``hypernym`` and ``instance_hypernym`` relations are traversed. """ return self.get_related("hypernym", "instance_hypernym") def hyponyms(self) -> list[Synset]: """Return the list of synsets related by any hyponym relation. Both the ``hyponym`` and ``instance_hyponym`` relations are traversed. """ return self.get_related("hyponym", "instance_hyponym") def translate( self, lexicon: str | None = None, *, lang: str | None = None ) -> list[Synset]: """Return a list of translated synsets. 
Arguments: lexicon: lexicon specifier of translated synsets lang: BCP-47 language code of translated synsets Example: >>> es = wn.synsets("araña", lang="es")[0] >>> en = es.translate(lexicon="ewn")[0] >>> en.lemmas() ['spider'] """ ili = self._ili if not ili: return [] lexicons = resolve_lexicon_specifiers(lexicon=(lexicon or "*"), lang=lang) return [ Synset(*data, _lexconf=self._lexconf) for data in get_synsets_for_ilis((ili,), lexicons) ] @dataclass(frozen=True, slots=True) class Count(LexiconElementWithMetadata): """A count of sense occurrences in some corpus.""" __module__ = "wn" value: int _lexicon: str = "" _metadata: Metadata | None = field(default=None, repr=False, compare=False) class Sense(_Relatable): """Class for modeling wordnet senses.""" __slots__ = "_entry_id", "_synset_id" __module__ = "wn" _ENTITY_TYPE = _EntityType.SENSES def __init__( self, id: str, entry_id: str, synset_id: str, _lexicon: str = "", _lexconf: LexiconConfiguration = _EMPTY_LEXCONFIG, ): super().__init__(id=id, _lexicon=_lexicon, _lexconf=_lexconf) self._entry_id = entry_id self._synset_id = synset_id def __repr__(self) -> str: return f"Sense({self.id!r})" def word(self) -> Word: """Return the word of the sense. Example: >>> wn.senses("spigot")[0].word() Word('pwn-spigot-n') """ lexicons = self._get_lexicons() id, pos, lex = next(find_entries(id=self._entry_id, lexicons=lexicons)) return Word(id, pos, _lexicon=lex, _lexconf=self._lexconf) def synset(self) -> Synset: """Return the synset of the sense. Example: >>> wn.senses("spigot")[0].synset() Synset('pwn-03325088-n') """ lexicons = self._get_lexicons() id, pos, ili, lex = next(find_synsets(id=self._synset_id, lexicons=lexicons)) return Synset(id, pos, ili=ili, _lexicon=lex, _lexconf=self._lexconf) @overload def examples(self, *, data: Literal[False] = False) -> list[str]: ... @overload def examples(self, *, data: Literal[True] = True) -> list[Example]: ... 
# fallback for non-literal bool argument @overload def examples(self, *, data: bool) -> list[str] | list[Example]: ... def examples(self, *, data: bool = False) -> list[str] | list[Example]: """Return the list of examples for the sense. If the *data* argument is :python:`False` (the default), the examples are returned as :class:`str` types. If it is :python:`True`, :class:`wn.Example` objects are used instead. """ lexicons = self._get_lexicons() exs = get_examples(self.id, "senses", lexicons) if data: return [ Example(text, language=lang, _lexicon=lex, _metadata=meta) for text, lang, lex, meta in exs ] else: return [text for text, *_ in exs] def lexicalized(self) -> bool: """Return True if the sense is lexicalized.""" return get_lexicalized(self.id, self._lexicon, "senses") def adjposition(self) -> str | None: """Return the adjective position of the sense. Values include :python:`"a"` (attributive), :python:`"p"` (predicative), and :python:`"ip"` (immediate postnominal). Note that this is only relevant for adjectival senses. Senses for other parts of speech, or for adjectives that are not annotated with this feature, will return ``None``. """ return get_adjposition(self.id, self._lexicon) def frames(self) -> list[str]: """Return the list of subcategorization frames for the sense.""" lexicons = self._get_lexicons() return get_syntactic_behaviours(self.id, lexicons) @overload def counts(self, *, data: Literal[False] = False) -> list[int]: ... @overload def counts(self, *, data: Literal[True] = True) -> list[Count]: ... # fallback for non-literal bool argument @overload def counts(self, *, data: bool) -> list[int] | list[Count]: ... 
def counts(self, *, data: bool = False) -> list[int] | list[Count]: """Return the corpus counts stored for this sense.""" lexicons = self._get_lexicons() count_data = list(get_sense_counts(self.id, lexicons)) if data: return [ Count(value, _lexicon=lex, _metadata=metadata) for value, lex, metadata in count_data ] else: return [value for value, *_ in count_data] def metadata(self) -> Metadata: """Return the sense's metadata.""" return get_metadata(self.id, self._lexicon, "senses") @overload def relations( self, *args: str, data: Literal[False] = False ) -> dict[str, list[Sense]]: ... @overload def relations( self, *args: str, data: Literal[True] = True ) -> dict[Relation, Sense]: ... # fallback for non-literal bool argument @overload def relations( self, *args: str, data: bool = False ) -> dict[str, list[Sense]] | dict[Relation, Sense]: ... def relations( self, *args: str, data: bool = False ) -> dict[str, list[Sense]] | dict[Relation, Sense]: """Return a mapping of relation names to lists of senses. One or more relation names may be given as positional arguments to restrict the relations returned. If no such arguments are given, all relations starting from the sense are returned. If the *data* argument is :python:`False` (default), the returned object maps from the relation name (a :class:`str`) to a list of :class:`Sense` objects. If *data* is :python:`True`, it instead maps from a :class:`Relation` to a single :class:`Sense`. See :meth:`get_related` for getting a flat list of related senses. """ if data: return dict(self._iter_sense_relations()) else: # inner dict is used as an order-preserving set relmap: dict[str, dict[Sense, bool]] = {} for relation, sense in self._iter_sense_relations(*args): relmap.setdefault(relation.name, {})[sense] = True # now convert inner dicts to lists return {relname: list(s_dict) for relname, s_dict in relmap.items()} @overload def synset_relations( self, *args: str, data: Literal[False] = False ) -> dict[str, list[Synset]]: ... 
@overload def synset_relations( self, *args: str, data: Literal[True] = True ) -> dict[Relation, Synset]: ... # fallback for non-literal bool argument @overload def synset_relations( self, *args: str, data: bool = False ) -> dict[str, list[Synset]] | dict[Relation, Synset]: ... def synset_relations( self, *args: str, data: bool = False ) -> dict[str, list[Synset]] | dict[Relation, Synset]: """Return a mapping of relation names to lists of synsets. One or more relation names may be given as positional arguments to restrict the relations returned. If no such arguments are given, all relations starting from the sense are returned. If the *data* argument is :python:`False` (default), the returned object maps from the relation name (a :class:`str`) to a list of :class:`Synset` objects. If *data* is :python:`True`, it instead maps from a :class:`Relation` to a single :class:`Synset`. See :meth:`get_related_synsets` for getting a flat list of related synsets. """ if data: return dict(self._iter_sense_synset_relations()) else: # inner dict is used as an order-preserving set relmap: dict[str, dict[Synset, bool]] = {} for relation, synset in self._iter_sense_synset_relations(*args): relmap.setdefault(relation.name, {})[synset] = True # now convert inner dicts to lists return {relname: list(ss_dict) for relname, ss_dict in relmap.items()} def get_related(self, *args: str) -> list[Sense]: """Return a list of related senses. One or more relation types should be passed as arguments which determine the kind of relations returned. Example: >>> physics = wn.senses("physics", lexicon="ewn")[0] >>> for sense in physics.get_related("has_domain_topic"): ... 
print(sense.word().lemma()) coherent chaotic incoherent """ return unique_list(sense for _, sense in self._iter_sense_relations(*args)) def get_related_synsets(self, *args: str) -> list[Synset]: """Return a list of related synsets.""" return unique_list( synset for _, synset in self._iter_sense_synset_relations(*args) ) def _iter_sense_relations(self, *args: str) -> Iterator[tuple[Relation, Sense]]: lexicons = self._get_lexicons() iterable = get_sense_relations(self.id, args, lexicons, lexicons) for relname, lexicon, metadata, sid, eid, ssid, lexid in iterable: relation = Relation(relname, self.id, sid, lexicon, metadata=metadata) sense = Sense(sid, eid, ssid, lexid, _lexconf=self._lexconf) yield relation, sense def _iter_sense_synset_relations( self, *args: str, ) -> Iterator[tuple[Relation, Synset]]: lexicons = self._get_lexicons() iterable = get_sense_synset_relations(self.id, args, lexicons, lexicons) for relname, lexicon, metadata, _, ssid, pos, ili, lexid in iterable: relation = Relation(relname, self.id, ssid, lexicon, metadata=metadata) synset = Synset(ssid, pos, ili, lexid, _lexconf=self._lexconf) yield relation, synset def translate( self, lexicon: str | None = None, *, lang: str | None = None ) -> list[Sense]: """Return a list of translated senses. Arguments: lexicon: lexicon specifier of translated senses lang: BCP-47 language code of translated senses Example: >>> en = wn.senses("petiole", lang="en")[0] >>> pt = en.translate(lang="pt")[0] >>> pt.word().lemma() 'pecíolo' """ synset = self.synset() return [ t_sense for t_synset in synset.translate(lang=lang, lexicon=lexicon) for t_sense in t_synset.senses() ] wn-1.0.0/wn/_db.py000066400000000000000000000112001513755206300136570ustar00rootroot00000000000000""" Storage back-end interface. 
""" import json import logging import sqlite3 from importlib import resources from pathlib import Path from wn._config import config from wn._exceptions import DatabaseError from wn._types import AnyPath from wn._util import format_lexicon_specifier, short_hash logger = logging.getLogger("wn") # Module Constants DEBUG = False # This stores hashes of the schema to check for version differences. # When the schema changes, the hash will change. If the new hash is # not added here, the 'test_schema_compatibility' test will fail. It # is the developer's responsibility to only add compatible schema # hashes here. If the schema change is not backwards-compatible, then # clear all old hashes and only put the latest hash here. A hash can # be generated like this: # # >>> import sqlite3 # >>> import wn # >>> conn = sqlite3.connect(wn.config.database_path) # >>> wn._db.schema_hash(conn) # COMPATIBLE_SCHEMA_HASHES = { "8348fc1a6254f514294a1dc70458e0733742935d", } # Optional metadata is stored as a JSON string def _adapt_dict(d: dict) -> bytes: return json.dumps(d).encode("utf-8") def _convert_dict(s: bytes) -> dict: return json.loads(s) def _convert_boolean(s: bytes) -> bool: return bool(int(s)) sqlite3.register_adapter(dict, _adapt_dict) sqlite3.register_converter("meta", _convert_dict) sqlite3.register_converter("boolean", _convert_boolean) # The pool is a cache of open connections. Unless the database path is # changed, there should only be zero or one. 
pool: dict[AnyPath, sqlite3.Connection] = {} # The connect() function should be used for all connections def connect(check_schema: bool = True) -> sqlite3.Connection: dbpath = config.database_path if dbpath not in pool: if not config.data_directory.exists(): config.data_directory.mkdir(parents=True, exist_ok=True) initialized = dbpath.is_file() conn = sqlite3.connect( str(dbpath), detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=not config.allow_multithreading, ) # foreign key support needs to be enabled for each connection conn.execute("PRAGMA foreign_keys = ON") if DEBUG: conn.set_trace_callback(print) if not initialized: logger.info("initializing database: %s", dbpath) _init_db(conn) if check_schema: _check_schema_compatibility(conn, dbpath) pool[dbpath] = conn return pool[dbpath] def _init_db(conn: sqlite3.Connection) -> None: schema = (resources.files("wn") / "schema.sql").read_text() conn.executescript(schema) with conn: conn.executemany( "INSERT INTO ili_statuses VALUES (null,?)", [("presupposed",), ("proposed",)], ) def _check_schema_compatibility(conn: sqlite3.Connection, dbpath: Path) -> None: hash = schema_hash(conn) # if the hash is known, then we're all good here if hash in COMPATIBLE_SCHEMA_HASHES: return logger.debug("current schema hash:\n %s", hash) logger.debug( "compatible schema hashes:\n %s", "\n ".join(COMPATIBLE_SCHEMA_HASHES) ) # otherwise, try to raise a helpful error message msg = "Wn's schema has changed and is no longer compatible with the database." try: specs = list_lexicons_safe(conn) except DatabaseError as exc: raise DatabaseError(msg) from exc if specs: installed = "\n ".join(specs) msg += ( f"\nLexicons currently installed:\n {installed}" "\nRun wn.reset_database(rebuild=True) to rebuild the database." ) else: msg += ( "\nNo lexicons are currently installed." "\nRun wn.reset_database() to re-initialize the database." 
) raise DatabaseError(msg) def list_lexicons_safe(conn: sqlite3.Connection | None = None) -> list[str]: """Return the list of lexicon specifiers for added lexicons.""" if conn is None: conn = connect(check_schema=False) try: specs = conn.execute("SELECT id, version FROM lexicons").fetchall() except sqlite3.OperationalError as exc: raise DatabaseError("could not list lexicons") from exc return [format_lexicon_specifier(id, ver) for id, ver in specs] def schema_hash(conn: sqlite3.Connection) -> str: query = "SELECT sql FROM sqlite_master WHERE NOT sql ISNULL" schema = "\n\n".join(row[0] for row in conn.execute(query)) return short_hash(schema) def clear_connections() -> None: """Close and delete any open database connections.""" for path in list(pool): pool[path].close() del pool[path] wn-1.0.0/wn/_download.py000066400000000000000000000113231513755206300151070ustar00rootroot00000000000000import logging from collections.abc import Sequence from pathlib import Path import httpx from wn._add import add as add_to_db from wn._config import config from wn._exceptions import Error from wn._util import is_url from wn.util import ProgressBar, ProgressHandler CHUNK_SIZE = 8 * 1024 # how many KB to read at a time TIMEOUT = 10 # number of seconds to wait for a server response logger = logging.getLogger("wn") def download( project_or_url: str, add: bool = True, progress_handler: type[ProgressHandler] | None = ProgressBar, ) -> Path: """Download the resource specified by *project_or_url*. First the URL of the resource is determined and then, depending on the parameters, the resource is downloaded and added to the database. The function then returns the path of the cached file. If *project_or_url* starts with `'http://'` or `'https://'`, then it is taken to be the URL for the resource. Otherwise, *project_or_url* is taken as a :ref:`project specifier ` and the URL is taken from a matching entry in Wn's project index. If no project matches the specifier, :exc:`wn.Error` is raised. 
If the URL has been downloaded and cached before, the cached file is used. Otherwise the URL is retrieved and stored in the cache. If the *add* paramter is ``True`` (default), the downloaded resource is added to the database. >>> wn.download("ewn:2020") Added ewn:2020 (English WordNet) The *progress_handler* parameter takes a subclass of :class:`wn.util.ProgressHandler`. An instance of the class will be created, used, and closed by this function. """ if progress_handler is None: progress_handler = ProgressHandler progress = progress_handler(message="Download", unit=" bytes") cache_path, urls = _get_cache_path_and_urls(project_or_url) try: if cache_path and cache_path.exists(): progress.flash(f"Cached file found: {cache_path!s}") path = cache_path elif urls: path = _download(urls, progress) else: raise Error("no urls to download") finally: progress.close() if add: try: add_to_db(path, progress_handler=progress_handler) except Error as exc: raise Error( f"could not add downloaded file: {path}\n You might try " "deleting the cached file and trying the download again." 
) from exc return path def _get_cache_path_and_urls(project_or_url: str) -> tuple[Path | None, list[str]]: if is_url(project_or_url): return config.get_cache_path(project_or_url), [project_or_url] else: info = config.get_project_info(project_or_url) return info.get("cache"), info["resource_urls"] def _download(urls: Sequence[str], progress: ProgressHandler) -> Path: client = httpx.Client(timeout=TIMEOUT, follow_redirects=True) try: for i, url in enumerate(urls, 1): path = config.get_cache_path(url) logger.info("download url: %s", url) logger.info("download cache path: %s", path) try: with open(path, "wb") as f: progress.set(status="Requesting", count=0) with client.stream("GET", url) as response: response.raise_for_status() total = int(response.headers.get("Content-Length", 0)) count = response.num_bytes_downloaded progress.set(count=count, total=total, status="Receiving") for chunk in response.iter_bytes(chunk_size=CHUNK_SIZE): if chunk: f.write(chunk) progress.update(response.num_bytes_downloaded - count) count = response.num_bytes_downloaded progress.set(status="Complete") except httpx.RequestError as exc: path.unlink(missing_ok=True) last_count = progress.kwargs["count"] if i == len(urls): raise Error(f"download failed at {last_count} bytes") from exc else: logger.info( "download failed at %d bytes; trying next url", last_count ) else: break # success except KeyboardInterrupt as exc: path.unlink(missing_ok=True) last_count = progress.kwargs["count"] raise Error(f"download cancelled at {last_count} bytes") from exc except Exception: path.unlink(missing_ok=True) raise finally: client.close() return path wn-1.0.0/wn/_exceptions.py000066400000000000000000000012431513755206300154610ustar00rootroot00000000000000class Error(Exception): """Generic error class for invalid wordnet operations.""" # reset the module so the user sees the public name __module__ = "wn" class DatabaseError(Error): """Error class for issues with the database.""" __module__ = "wn" class 
ConfigurationError(Error): """Raised on invalid configurations.""" __module__ = "wn" class ProjectError(Error): """Raised when a project is not found or on errors defined in the index.""" __module__ = "wn" class WnWarning(Warning): """Generic warning class for dubious wordnet operations.""" # reset the module so the user sees the public name __module__ = "wn" wn-1.0.0/wn/_export.py000066400000000000000000000514751513755206300146350ustar00rootroot00000000000000from collections.abc import Iterator, Sequence from typing import Literal, NamedTuple, overload from wn import lmf from wn._exceptions import Error from wn._lexicon import Lexicon from wn._queries import ( Form, Pronunciation, Sense, Tag, find_entries, find_proposed_ilis, find_senses, find_synsets, find_syntactic_behaviours, get_adjposition, get_definitions, get_entry_forms, get_entry_index, get_entry_senses, get_examples, get_lexfile, get_lexicalized, get_lexicon_dependencies, get_metadata, get_proposed_ili_metadata, get_relation_targets, get_sense_counts, get_sense_n, get_sense_relations, get_sense_synset_relations, get_synset_members, get_synset_relations, ) from wn._types import AnyPath, VersionInfo from wn._util import split_lexicon_specifier, version_info PROPOSED_ILI_ID = "in" # special case for proposed ILIs def export( lexicons: Sequence[Lexicon], destination: AnyPath, version: str = "1.4" ) -> None: """Export lexicons from the database to a WN-LMF file. 
More than one lexicon may be exported in the same file, subject to these conditions: - identifiers on wordnet entities must be unique in all lexicons - lexicons extensions may not be exported with their dependents >>> w = wn.Wordnet(lexicon="omw-cmn:1.4 omw-zsm:1.4") >>> wn.export(w.lexicons(), "cmn-zsm.xml") Args: lexicons: sequence of :class:`wn.Lexicon` objects destination: path to the destination file version: LMF version string """ _precheck(lexicons) exporter = _LMFExporter(version) resource: lmf.LexicalResource = { "lmf_version": version, "lexicons": [exporter.export(lexicon) for lexicon in lexicons], } lmf.dump(resource, destination) def _precheck(lexicons: Sequence[Lexicon]) -> None: all_ids: set[str] = set() for lex in lexicons: lexspecs = (lex.specifier(),) idset = {lex.id} idset.update(row[0] for row in find_entries(lexicons=lexspecs)) idset.update(row[0] for row in find_senses(lexicons=lexspecs)) idset.update(row[0] for row in find_synsets(lexicons=lexspecs)) # TODO: syntactic behaviours if all_ids.intersection(idset): raise Error("cannot export: non-unique identifiers in lexicons") all_ids |= idset _SBMap = dict[str, list[tuple[str, str]]] class _LexSpecs(NamedTuple): primary: str # lexicon or lexicon extension being exported base: str # base lexicon (when primary is an extension) class _LMFExporter: version: VersionInfo # ids: set[str] # The following are reset for each lexicon that is exported lexspecs: _LexSpecs sbmap: _SBMap external_sense_ids: set[str] # necessary external senses external_synset_ids: set[str] # necessary external synsets def __init__(self, version: str) -> None: if version not in lmf.SUPPORTED_VERSIONS: raise Error(f"WN-LMF version not supported: {version}") self.version = version_info(version) self.lexspecs = _LexSpecs("", "") self.sbmap = {} self.external_sense_ids = set() self.external_synset_ids = set() def export(self, lexicon: Lexicon) -> lmf.Lexicon | lmf.LexiconExtension: base = lexicon.extends() self.lexspecs = 
_LexSpecs(lexicon.specifier(), base.specifier() if base else "") self.sbmap = _build_sbmap(self.lexspecs) if base is None: return self._lexicon(lexicon) else: self.external_sense_ids = _get_external_sense_ids(self.lexspecs) self.external_synset_ids = _get_external_synset_ids(self.lexspecs) return self._lexicon_extension(lexicon, base) def _lexicon(self, lexicon: Lexicon) -> lmf.Lexicon: lex = lmf.Lexicon( id=lexicon.id, label=lexicon.label, language=lexicon.language, email=lexicon.email, license=lexicon.license, version=lexicon.version, url=lexicon.url or "", citation=lexicon.citation or "", entries=list(self._entries(False)), synsets=list(self._synsets(False)), meta=lexicon.metadata(), ) if self.version >= (1, 1): lex["logo"] = lexicon.logo or "" lex["requires"] = self._requires() lex["frames"] = self._syntactic_behaviours_1_1() return lex def _requires(self) -> list[lmf.Dependency]: dependencies: list[lmf.Dependency] = [] for specifier, url, _ in get_lexicon_dependencies(self.lexspecs.primary): id, version = split_lexicon_specifier(specifier) dependencies.append(self._dependency(id, version, url)) return dependencies def _dependency(self, id: str, version: str, url: str | None) -> lmf.Dependency: return lmf.Dependency(id=id, version=version, url=url) @overload def _entries( self, extension: Literal[True] ) -> Iterator[lmf.LexicalEntry | lmf.ExternalLexicalEntry]: ... @overload def _entries(self, extension: Literal[False]) -> Iterator[lmf.LexicalEntry]: ... 
def _entries( self, extension: Literal[True, False] ) -> Iterator[lmf.LexicalEntry | lmf.ExternalLexicalEntry]: lexspec = self.lexspecs.primary lexicons = self.lexspecs if extension else (lexspec,) for id, pos, lex in find_entries(lexicons=lexicons): if lex == lexspec: yield self._entry(id, pos) elif extension and (entry := self._ext_entry(id)): yield entry def _entry(self, id: str, pos: str) -> lmf.LexicalEntry: lexspec = self.lexspecs.primary lemma, forms = _get_entry_forms(id, self.lexspecs) index = get_entry_index(id, lexspec) entry = lmf.LexicalEntry( id=id, lemma=self._lemma(lemma, pos), forms=[self._form(form) for form in forms], index=index or "", senses=list(self._senses(id, index, False)), meta=self._metadata(id, "entries"), ) if self.version < (1, 1): # cleanup 1.1+ features entry["lemma"].pop("pronunciations", None) for form in entry["forms"]: form.pop("pronunciations", None) # 1.0 has syntactic behaviours on each entry entry["frames"] = self._syntactic_behaviours_1_0(entry) if self.version < (1, 4) and index: entry.pop("index", None) return entry def _lemma(self, form: Form, pos: str) -> lmf.Lemma: return lmf.Lemma( writtenForm=form[0], partOfSpeech=pos, script=(form[2] or ""), pronunciations=self._pronunciations(form[4]), tags=self._tags(form[5]), ) def _form(self, form: Form) -> lmf.Form: return lmf.Form( writtenForm=form[0], id=form[1] or "", script=form[2] or "", pronunciations=self._pronunciations(form[4]), tags=self._tags(form[5]), ) def _pronunciations(self, prons: list[Pronunciation]) -> list[lmf.Pronunciation]: lexspec = self.lexspecs.primary return [ lmf.Pronunciation( text=text, variety=variety or "", notation=notation or "", phonemic=phonemic, audio=audio or "", ) for text, variety, notation, phonemic, audio, lex in prons if lex == lexspec ] def _tags(self, tags: list[Tag]) -> list[lmf.Tag]: lexspec = self.lexspecs.primary return [ lmf.Tag(text=text, category=category) for text, category, lex in tags if lex == lexspec ] @overload def 
_senses( self, id: str, index: str | None, extension: Literal[True] ) -> Iterator[lmf.Sense | lmf.ExternalSense]: ... @overload def _senses( self, id: str, index: str | None, extension: Literal[False] ) -> Iterator[lmf.Sense]: ... def _senses( self, id: str, index: str | None, extension: Literal[True, False] ) -> Iterator[lmf.Sense | lmf.ExternalSense]: lexspec = self.lexspecs.primary lexicons = self.lexspecs if extension else (lexspec,) for i, sense in enumerate(get_entry_senses(id, lexicons, False), 1): sid, _, _, lex = sense if lex == lexspec: yield self._sense(sense, index, i) elif extension and (ext_sense := self._ext_sense(sid)): yield ext_sense def _sense(self, sense: Sense, index: str | None, i: int) -> lmf.Sense: id, _, synset_id, lexspec = sense lmf_sense = lmf.Sense( id=id, synset=synset_id, n=_get_sense_n(id, lexspec, index, i), relations=self._sense_relations(id), examples=self._examples(id, "senses"), counts=self._counts(id), meta=self._metadata(id, "senses"), lexicalized=get_lexicalized(id, lexspec, "senses"), adjposition=get_adjposition(id, lexspec) or "", ) if self.version >= (1, 1) and id in self.sbmap: lmf_sense["subcat"] = sorted(sbid for sbid, _ in self.sbmap[id]) return lmf_sense def _sense_relations(self, sense_id: str) -> list[lmf.Relation]: # only get relations defined for the primary lexicon, but the # relation target can be from a base lexicon lexicons = (self.lexspecs.primary,) relations: list[lmf.Relation] = [ lmf.Relation(target=id, relType=type, meta=metadata) for type, _, metadata, id, *_ in get_sense_relations( sense_id, "*", lexicons, self.lexspecs ) ] relations.extend( lmf.Relation(target=id, relType=type, meta=metadata) for type, _, metadata, _, id, *_ in get_sense_synset_relations( sense_id, "*", lexicons, self.lexspecs ) ) return relations def _examples(self, id: str, table: str) -> list[lmf.Example]: lexicons = (self.lexspecs.primary,) # only for the lexicon being exported return [ lmf.Example(text=text, language=language, 
meta=metadata) for text, language, _, metadata in get_examples(id, table, lexicons) ] def _counts(self, sense_id: str) -> list[lmf.Count]: lexicons = (self.lexspecs.primary,) # only for the lexicon being exported return [ lmf.Count(value=val, meta=metadata) for val, _, metadata in get_sense_counts(sense_id, lexicons) ] @overload def _synsets( self, extension: Literal[True] ) -> Iterator[lmf.Synset | lmf.ExternalSynset]: ... @overload def _synsets(self, extension: Literal[False]) -> Iterator[lmf.Synset]: ... def _synsets( self, extension: Literal[True, False] ) -> Iterator[lmf.Synset | lmf.ExternalSynset]: lexspec = self.lexspecs.primary lexicons = self.lexspecs if extension else (lexspec,) for id, pos, ili, lex in find_synsets(lexicons=lexicons): if lex == lexspec: yield self._synset(id, pos, ili) elif extension and (ext_synset := self._ext_synset(id)): yield ext_synset def _synset(self, id: str, pos: str, ili: str) -> lmf.Synset: lexspec = self.lexspecs.primary lexicons = (lexspec,) ilidef = self._ili_definition(id) if ilidef and not ili: ili = PROPOSED_ILI_ID ss = lmf.Synset( id=id, ili=ili or "", partOfSpeech=pos, definitions=self._definitions(id), relations=self._synset_relations(id, lexspec), examples=self._examples(id, "synsets"), lexicalized=get_lexicalized(id, lexspec, "synsets"), lexfile=get_lexfile(id, lexspec) or "", meta=self._metadata(id, "synsets"), ) if ilidef: ss["ili_definition"] = ilidef if self.version >= (1, 1): ss["members"] = [row[0] for row in get_synset_members(id, lexicons)] return ss def _definitions(self, synset_id: str) -> list[lmf.Definition]: lexicons = (self.lexspecs.primary,) # only for the lexicon being exported return [ lmf.Definition( text=text, language=language, sourceSense=sense_id, meta=metadata, ) for text, language, sense_id, _, metadata in get_definitions( synset_id, lexicons ) ] def _ili_definition(self, synset: str) -> lmf.ILIDefinition | None: lexicons = (self.lexspecs.primary,) # only for the lexicon being exported _, 
lexspec, defn, _ = next( find_proposed_ilis(synset_id=synset, lexicons=lexicons), (None, None, None, None), ) ilidef: lmf.ILIDefinition | None = None if defn: meta = None if lexspec is not None: meta = get_proposed_ili_metadata(synset, lexspec) ilidef = lmf.ILIDefinition(text=defn, meta=meta) return ilidef def _synset_relations( self, synset_id: str, synset_lexicon: str ) -> list[lmf.Relation]: # only get relations defined for the primary lexicon, but the # relation target can be from a base lexicon lexicons = (self.lexspecs.primary,) return [ lmf.Relation(target=id, relType=type, meta=metadata) for type, _, metadata, _, id, *_ in get_synset_relations( synset_id, synset_lexicon, "*", lexicons, self.lexspecs ) ] def _syntactic_behaviours_1_0( self, entry: lmf.LexicalEntry, ) -> list[lmf.SyntacticBehaviour]: frames: list[lmf.SyntacticBehaviour] = [] sense_ids = {s["id"] for s in entry.get("senses", [])} sbs: dict[str, set[str]] = {} for sid in sense_ids: for _, subcat_frame in self.sbmap.get(sid, []): sbs.setdefault(subcat_frame, set()).add(sid) for subcat_frame, sids in sbs.items(): frame: lmf.SyntacticBehaviour = { "subcategorizationFrame": subcat_frame, "senses": sorted(sids), } frames.append(frame) return frames def _syntactic_behaviours_1_1(self) -> list[lmf.SyntacticBehaviour]: lexicons = (self.lexspecs.primary,) # only for the lexicon being exported return [ lmf.SyntacticBehaviour(id=id or "", subcategorizationFrame=frame) for id, frame, _ in find_syntactic_behaviours(lexicons=lexicons) ] def _metadata(self, id: str, table: str) -> lmf.Metadata: return get_metadata(id, self.lexspecs.primary, table) ### Lexicon Extensions ################################################### def _lexicon_extension( self, lexicon: Lexicon, base: Lexicon ) -> lmf.LexiconExtension: lexspec = self.lexspecs.primary if self.version < (1, 1): raise Error( f"cannot export lexicon extension {lexspec} with WN-LMF version < 1.1" ) lex = lmf.LexiconExtension( id=lexicon.id, 
label=lexicon.label, language=lexicon.language, email=lexicon.email, license=lexicon.license, version=lexicon.version, url=lexicon.url or "", citation=lexicon.citation or "", logo=lexicon.logo or "", extends=self._dependency(base.id, base.version, base.url), requires=self._requires(), entries=list(self._entries(True)), synsets=list(self._synsets(True)), frames=self._syntactic_behaviours_1_1(), meta=lexicon.metadata(), ) return lex def _ext_entry(self, id: str) -> lmf.ExternalLexicalEntry | None: lexspec = self.lexspecs.primary lemma, forms = _get_entry_forms(id, self.lexspecs) index = get_entry_index(id, lexspec) ext_lemma = self._ext_lemma(lemma) ext_forms = self._ext_forms(forms) ext_senses = list(self._senses(id, index, True)) if ext_lemma or ext_forms or ext_senses: return lmf.ExternalLexicalEntry( external=True, id=id, lemma=ext_lemma, forms=ext_forms, senses=ext_senses, ) return None def _ext_lemma(self, lemma: Form) -> lmf.ExternalLemma | None: _, _, _, _, pronunciations, tags = lemma ext_prons = self._pronunciations(pronunciations) ext_tags = self._tags(tags) if ext_prons or ext_tags: return lmf.ExternalLemma( external=True, pronunciations=ext_prons, tags=ext_tags, ) return None def _ext_forms(self, forms: list[Form]) -> list[lmf.Form | lmf.ExternalForm]: lexspec = self.lexspecs.primary ext_forms: list[lmf.Form | lmf.ExternalForm] = [] for form in forms: if form[3] == lexspec: ext_forms.append(self._form(form)) elif ext_form := self._ext_form(form): ext_forms.append(ext_form) return ext_forms def _ext_form(self, form: Form) -> lmf.ExternalForm | None: value, id, _, _, prons, tags = form ext_prons = self._pronunciations(prons) ext_tags = self._tags(tags) if ext_prons or ext_tags: if not id: raise Error(f"cannot export external form {value!r} without an id") return lmf.ExternalForm( external=True, id=id, pronunciations=ext_prons, tags=ext_tags, ) return None def _ext_sense(self, id: str) -> lmf.ExternalSense | None: ext_relations = self._sense_relations(id) 
ext_examples = self._examples(id, "senses") ext_counts = self._counts(id) if ext_relations or ext_examples or ext_counts or id in self.external_sense_ids: return lmf.ExternalSense( external=True, id=id, relations=ext_relations, examples=ext_examples, counts=ext_counts, ) return None def _ext_synset(self, id: str) -> lmf.ExternalSynset | None: ext_definitions = self._definitions(id) ext_relations = self._synset_relations(id, self.lexspecs.base) ext_examples = self._examples(id, "synsets") if ( ext_definitions or ext_relations or ext_examples or id in self.external_synset_ids ): return lmf.ExternalSynset( external=True, id=id, definitions=ext_definitions, relations=ext_relations, examples=ext_examples, ) return None ### Helper Functions ######################################################### def _build_sbmap(lexicons: Sequence[str]) -> _SBMap: # WN-LMF 1.0 lexicons put syntactic behaviours on lexical entries # WN-LMF 1.1 lexicons use a 'subcat' IDREFS attribute sbmap: _SBMap = {} for sbid, frame, sids in find_syntactic_behaviours(lexicons=lexicons): for sid in sids: sbmap.setdefault(sid, []).append((sbid, frame)) return sbmap def _get_entry_forms(id: str, lexicons: Sequence[str]) -> tuple[Form, list[Form]]: all_forms: list[Form] = list(get_entry_forms(id, lexicons)) # the first result is always the lemma return all_forms[0], all_forms[1:] def _get_sense_n(id: str, lexspec: str, index: str | None, i: int) -> int: """Get the n rank value for a sense. The n value is only informative if it is non-None and different from the expected rank i. If an index is used, always return a non-None value of n, even if it is the expected rank. 
""" n = get_sense_n(id, lexspec) if n is not None and (index is not None or n != i): return n return 0 def _get_external_sense_ids(lexspecs: _LexSpecs) -> set[str]: """Get ids of external senses needed for an extension.""" return get_relation_targets( "sense_relations", "senses", (lexspecs.primary,), lexspecs ) def _get_external_synset_ids(lexspecs: _LexSpecs) -> set[str]: """Get ids of external synsets needed for an extension.""" return ( get_relation_targets( "synset_relations", "synsets", (lexspecs.primary,), lexspecs ) | get_relation_targets( "sense_synset_relations", "synsets", (lexspecs.primary,), lexspecs ) | { sense[2] for sense in find_senses(lexicons=lexspecs) if sense[3] != lexspecs.base } ) wn-1.0.0/wn/_lexicon.py000066400000000000000000000131771513755206300147520ustar00rootroot00000000000000from __future__ import annotations from dataclasses import dataclass, field from typing import TYPE_CHECKING, NamedTuple, Protocol, TypeVar from wn._metadata import HasMetadata from wn._queries import ( find_entries, find_ilis, find_senses, find_synsets, get_lexicon, get_lexicon_dependencies, get_lexicon_extension_bases, get_lexicon_extensions, get_modified, ) if TYPE_CHECKING: from collections.abc import Callable, Sequence from wn._metadata import Metadata DEFAULT_CONFIDENCE = 1.0 Self = TypeVar("Self", bound="Lexicon") # typing.Self, python_version>=3.11 @dataclass(repr=False, eq=True, frozen=True, slots=True) class Lexicon(HasMetadata): """A class representing a wordnet lexicon.""" __module__ = "wn" _specifier: str id: str label: str language: str email: str license: str version: str url: str | None = None citation: str | None = None logo: str | None = None _metadata: Metadata | None = field(default=None, hash=False) @classmethod def from_specifier(cls: type[Self], specifier: str) -> Self: data = get_lexicon(specifier) spec, id, label, lang, email, license, version, url, citation, logo, meta = data return cls( spec, id, label, lang, email, license, version, 
url=url, citation=citation, logo=logo, _metadata=meta, ) def __repr__(self): return f"" def specifier(self) -> str: """Return the *id:version* lexicon specifier.""" return self._specifier def confidence(self) -> float: """Return the confidence score of the lexicon. If the lexicon does not specify a confidence score, it defaults to 1.0. """ return float(self.metadata().get("confidenceScore", DEFAULT_CONFIDENCE)) def modified(self) -> bool: """Return True if the lexicon has local modifications.""" return get_modified(self._specifier) def requires(self) -> dict[str, Lexicon | None]: """Return the lexicon dependencies.""" return { spec: (None if added is None else Lexicon.from_specifier(spec)) for spec, _, added in get_lexicon_dependencies(self._specifier) } def extends(self) -> Lexicon | None: """Return the lexicon this lexicon extends, if any. If this lexicon is not an extension, return None. """ bases = get_lexicon_extension_bases(self._specifier, depth=1) if bases: return Lexicon.from_specifier(bases[0]) return None def extensions(self, depth: int = 1) -> list[Lexicon]: """Return the list of lexicons extending this one. By default, only direct extensions are included. This is controlled by the *depth* parameter, which if you view extensions as children in a tree where the current lexicon is the root, *depth=1* are the immediate extensions. Increasing this number gets extensions of extensions, or setting it to a negative number gets all "descendant" extensions. """ return [ Lexicon.from_specifier(spec) for spec in get_lexicon_extensions(self._specifier, depth=depth) ] def describe(self, full: bool = True) -> str: """Return a formatted string describing the lexicon. The *full* argument (default: :python:`True`) may be set to :python:`False` to omit word and sense counts. 
Also see: :meth:`Wordnet.describe` """ lexspecs = (self.specifier(),) substrings: list[str] = [ f"{self._specifier}", f" Label : {self.label}", f" URL : {self.url}", f" License: {self.license}", ] if full: substrings.extend( [ f" Words : {_desc_counts(find_entries, lexspecs)}", f" Senses : {sum(1 for _ in find_senses(lexicons=lexspecs))}", ] ) substrings.extend( [ f" Synsets: {_desc_counts(find_synsets, lexspecs)}", f" ILIs : {sum(1 for _ in find_ilis(lexicons=lexspecs)):>6}", ] ) return "\n".join(substrings) def _desc_counts(query: Callable, lexspecs: Sequence[str]) -> str: count: dict[str, int] = {} for _, pos, *_ in query(lexicons=lexspecs): if pos not in count: count[pos] = 1 else: count[pos] += 1 subcounts = ", ".join(f"{pos}: {count[pos]}" for pos in sorted(count)) return f"{sum(count.values()):>6} ({subcounts})" class LexiconElement(Protocol): """Protocol for elements defined within a lexicon.""" _lexicon: str # source lexicon specifier def lexicon(self) -> Lexicon: """Return the lexicon containing the element.""" return Lexicon.from_specifier(self._lexicon) class LexiconElementWithMetadata(LexiconElement, HasMetadata, Protocol): """Protocol for lexicon elements with metadata.""" def confidence(self) -> float: """Return the confidence score of the element. If the element does not have an explicit confidence score, the value defaults to that of the lexicon containing the element. """ c = self.metadata().get("confidenceScore") if c is None: c = self.lexicon().confidence() return float(c) class LexiconConfiguration(NamedTuple): lexicons: tuple[str, ...] expands: tuple[str, ...] 
default_mode: bool wn-1.0.0/wn/_metadata.py000066400000000000000000000017001513755206300150560ustar00rootroot00000000000000from typing import Protocol, TypedDict class Metadata(TypedDict, total=False): # For these, see https://globalwordnet.github.io/schemas/dc/ contributor: str coverage: str creator: str date: str description: str format: str identifier: str publisher: str relation: str rights: str source: str subject: str title: str type: str # Additional WN-LMF metadata status: str note: str confidenceScore: float class HasMetadata(Protocol): @property def _metadata(self) -> Metadata | None: return None def metadata(self) -> Metadata: """Return the associated metadata.""" return self._metadata if self._metadata is not None else Metadata() def confidence(self) -> float: """Return the confidence score. If the confidenceScore metadata is available, return it. If not, use a default confidence value. """ ... wn-1.0.0/wn/_module_functions.py000066400000000000000000000160511513755206300166600ustar00rootroot00000000000000from typing import Literal, overload from wn._config import config from wn._core import Form, Sense, Synset, Word from wn._db import clear_connections, connect, list_lexicons_safe from wn._download import download from wn._exceptions import Error from wn._lexicon import Lexicon from wn._util import format_lexicon_specifier from wn._wordnet import Wordnet def projects() -> list[dict]: """Return the list of indexed projects. This returns the same dictionaries of information as :meth:`wn.config.get_project_info `, but for all indexed projects. 
Example: >>> infos = wn.projects() >>> len(infos) 36 >>> infos[0]["label"] 'Open English WordNet' """ index = config.index return [ config.get_project_info(format_lexicon_specifier(project_id, version)) for project_id, project_info in index.items() for version in project_info.get("versions", []) if "resource_urls" in project_info["versions"][version] ] def lexicons(*, lexicon: str | None = "*", lang: str | None = None) -> list[Lexicon]: """Return the lexicons matching a language or lexicon specifier. Example: >>> wn.lexicons(lang="en") [, ] """ try: w = Wordnet(lang=lang, lexicon=lexicon or "*") except Error: return [] else: return w.lexicons() def reset_database(rebuild: bool = False) -> None: """Delete and recreate the database file. If *rebuild* is :python:`True`, Wn will attempt to add all lexicons that are added in the existing database. Note that this will only attempt to add indexed projects via their lexicon specifiers, (using :python:`wn.download(specifier)`) regardless of how they were originally added, and will not attempt to add resources from unindexed URLs or local files (unless those local files are cached versions of indexed resources). This function is useful when database schema changes necessitate a rebuild or when testing requires a clean database. .. warning:: This will completely delete the database and all added resources. It does not delete the download cache. Using ``rebuild=True`` does not re-add non-lexicon resources like CILI files or unindexed resources, so you will need to add those manually. """ specs = list_lexicons_safe() clear_connections() config.database_path.unlink(missing_ok=True) connect() if rebuild: for spec in specs: download(spec) clear_connections() def word(id: str, *, lexicon: str | None = None, lang: str | None = None) -> Word: """Return the word with *id* in *lexicon*. This will create a :class:`Wordnet` object using the *lang* and *lexicon* arguments. 
The *id* argument is then passed to the :meth:`Wordnet.word` method. >>> wn.word("ewn-cell-n") Word('ewn-cell-n') """ return Wordnet(lang=lang, lexicon=lexicon).word(id) def words( form: str | None = None, pos: str | None = None, *, lexicon: str | None = None, lang: str | None = None, ) -> list[Word]: """Return the list of matching words. This will create a :class:`Wordnet` object using the *lang* and *lexicon* arguments. The remaining arguments are passed to the :meth:`Wordnet.words` method. >>> len(wn.words()) 282902 >>> len(wn.words(pos="v")) 34592 >>> wn.words(form="scurry") [Word('ewn-scurry-n'), Word('ewn-scurry-v')] """ return Wordnet(lang=lang, lexicon=lexicon).words(form=form, pos=pos) @overload def lemmas( form: str | None = None, pos: str | None = None, *, data: Literal[False] = False, lexicon: str | None = None, lang: str | None = None, ) -> list[str]: ... @overload def lemmas( form: str | None = None, pos: str | None = None, *, data: Literal[True] = True, lexicon: str | None = None, lang: str | None = None, ) -> list[Form]: ... @overload def lemmas( form: str | None = None, pos: str | None = None, *, data: bool, lexicon: str | None = None, lang: str | None = None, ) -> list[str] | list[Form]: ... def lemmas( form: str | None = None, pos: str | None = None, *, data: bool = False, lexicon: str | None = None, lang: str | None = None, ) -> list[str] | list[Form]: """Return the list of lemmas for matching words. This will create a :class:`Wordnet` object using the *lang* and *lexicon* arguments. The remaining arguments are passed to the :meth:`Wordnet.lemmas` method. If the *data* argument is :python:`False` (the default), the lemmas are returned as :class:`str` types. If it is :python:`True`, :class:`wn.Form` objects are used instead. 
>>> wn.lemmas("wolves") ['wolf'] >>> wn.lemmas("wolves", data=True) [Form(value='wolf')] >>> len(wn.lemmas(pos="v")) 11617 """ return Wordnet(lang=lang, lexicon=lexicon).lemmas(form=form, pos=pos, data=data) def synset(id: str, *, lexicon: str | None = None, lang: str | None = None) -> Synset: """Return the synset with *id* in *lexicon*. This will create a :class:`Wordnet` object using the *lang* and *lexicon* arguments. The *id* argument is then passed to the :meth:`Wordnet.synset` method. >>> wn.synset("ewn-03311152-n") Synset('ewn-03311152-n') """ return Wordnet(lang=lang, lexicon=lexicon).synset(id=id) def synsets( form: str | None = None, pos: str | None = None, ili: str | None = None, *, lexicon: str | None = None, lang: str | None = None, ) -> list[Synset]: """Return the list of matching synsets. This will create a :class:`Wordnet` object using the *lang* and *lexicon* arguments. The remaining arguments are passed to the :meth:`Wordnet.synsets` method. >>> len(wn.synsets("couch")) 4 >>> wn.synsets("couch", pos="v") [Synset('ewn-00983308-v')] """ return Wordnet(lang=lang, lexicon=lexicon).synsets(form=form, pos=pos, ili=ili) def senses( form: str | None = None, pos: str | None = None, *, lexicon: str | None = None, lang: str | None = None, ) -> list[Sense]: """Return the list of matching senses. This will create a :class:`Wordnet` object using the *lang* and *lexicon* arguments. The remaining arguments are passed to the :meth:`Wordnet.senses` method. >>> len(wn.senses("twig")) 3 >>> wn.senses("twig", pos="n") [Sense('ewn-twig-n-13184889-02')] """ return Wordnet(lang=lang, lexicon=lexicon).senses(form=form, pos=pos) def sense(id: str, *, lexicon: str | None = None, lang: str | None = None) -> Sense: """Return the sense with *id* in *lexicon*. This will create a :class:`Wordnet` object using the *lang* and *lexicon* arguments. The *id* argument is then passed to the :meth:`Wordnet.sense` method. 
>>> wn.sense("ewn-flutter-v-01903884-02") Sense('ewn-flutter-v-01903884-02') """ return Wordnet(lang=lang, lexicon=lexicon).sense(id=id) wn-1.0.0/wn/_queries.py000066400000000000000000001123611513755206300147610ustar00rootroot00000000000000""" Database retrieval queries. """ import itertools from collections.abc import Collection, Iterator, Sequence from typing import cast from wn._db import connect from wn._exceptions import Error from wn._metadata import Metadata # Local Types Pronunciation = tuple[ str, # value str | None, # variety str | None, # notation bool, # phonemic str | None, # audio str, # lexicon specifier ] Tag = tuple[str, str, str] # tag, category, lexicon specifier Form = tuple[ str, # form str | None, # id str | None, # script str, # lexicon list[Pronunciation], # pronunciations list[Tag], # tags ] _Word = tuple[ str, # id str, # pos str, # lexicon specifier ] _Synset = tuple[ str, # id str, # pos str, # ili str, # lexicon specifier ] _Synset_Relation = tuple[ str, # rel_name str, # lexicon Metadata, # metadata str, # srcid str, # _Synset... str, str, str, ] _Definition = tuple[ str, # text str, # language str, # sourceSense str, # lexicon Metadata | None, # metadata ] _Example = tuple[ str, # text str, # language str, # lexicon Metadata | None, # metadata ] Sense = tuple[ str, # id str, # entry_id str, # synset_id str, # lexicon specifier ] _Sense_Relation = tuple[ str, # rel_name str, # lexicon Metadata, # metadata str, # Sense... 
str, str, str, ] _Count = tuple[int, str, Metadata] # count, lexicon, metadata _SyntacticBehaviour = tuple[ str, # id str, # frame list[str], # sense ids ] _ExistingILI = tuple[ str, # id str, # status str | None, # definition Metadata, ] _ProposedILI = tuple[ str, # synset id str, # lexicon str, # definition Metadata, ] _Lexicon = tuple[ str, # specifier str, # id str, # label str, # language str, # email str, # license str, # version str, # url str, # citation str, # logo Metadata | None, # metadata ] def resolve_lexicon_specifiers( lexicon: str, lang: str | None = None, ) -> list[str]: cur = connect().cursor() specifiers: list[str] = [] for specifier in lexicon.split(): limit = "-1" if "*" in lexicon else "1" if ":" not in specifier: specifier += ":*" query = f""" SELECT DISTINCT specifier FROM lexicons WHERE specifier GLOB :specifier AND (:language ISNULL OR language = :language) LIMIT {limit} """ params = {"specifier": specifier, "language": lang} specifiers.extend(row[0] for row in cur.execute(query, params)) # only raise an error when the query specifies something if not specifiers and (lexicon != "*" or lang is not None): raise Error(f"no lexicon found with lang={lang!r} and lexicon={lexicon!r}") return specifiers def get_lexicon(lexicon: str) -> _Lexicon: query = """ SELECT DISTINCT specifier, id, label, language, email, license, version, url, citation, logo, metadata FROM lexicons WHERE specifier = ? """ row: _Lexicon | None = connect().execute(query, (lexicon,)).fetchone() if row is None: raise LookupError(lexicon) # should we have a WnLookupError? return row def get_modified(lexicon: str) -> bool: query = "SELECT modified FROM lexicons WHERE specifier = ?" 
return connect().execute(query, (lexicon,)).fetchone()[0] def get_lexicon_dependencies(lexicon: str) -> list[tuple[str, str, bool]]: query = """ SELECT provider_id || ":" || provider_version, provider_url, provider_rowid FROM lexicon_dependencies JOIN lexicons AS lex ON lex.rowid = dependent_rowid WHERE lex.specifier = ? """ return [ (spec, url, rowid is not None) for spec, url, rowid in connect().execute(query, (lexicon,)) ] def get_lexicon_extension_bases(lexicon: str, depth: int = -1) -> list[str]: query = """ WITH RECURSIVE ext(x, d) AS (SELECT base_rowid, 1 FROM lexicon_extensions JOIN lexicons AS lex ON lex.rowid = extension_rowid WHERE lex.specifier = :specifier UNION SELECT base_rowid, d+1 FROM lexicon_extensions JOIN ext ON extension_rowid = x) SELECT baselex.specifier FROM ext JOIN lexicons AS baselex ON baselex.rowid = ext.x WHERE :depth < 0 OR d <= :depth ORDER BY d """ rows = connect().execute(query, {"specifier": lexicon, "depth": depth}) return [row[0] for row in rows] def get_lexicon_extensions(lexicon: str, depth: int = -1) -> list[str]: query = """ WITH RECURSIVE ext(x, d) AS (SELECT extension_rowid, 1 FROM lexicon_extensions JOIN lexicons AS lex ON lex.rowid = base_rowid WHERE lex.specifier = :specifier UNION SELECT extension_rowid, d+1 FROM lexicon_extensions JOIN ext ON base_rowid = x) SELECT extlex.specifier FROM ext JOIN lexicons AS extlex ON extlex.rowid = ext.x WHERE :depth < 0 OR d <= :depth ORDER BY d """ rows = connect().execute(query, {"specifier": lexicon, "depth": depth}) return [row[0] for row in rows] def get_ili(id: str) -> _ExistingILI | None: query = """ SELECT i.id, ist.status, i.definition, i.metadata FROM ilis AS i JOIN ili_statuses AS ist ON i.status_rowid = ist.rowid WHERE i.id = ? 
LIMIT 1 """ return connect().execute(query, (id,)).fetchone() def find_ilis( status: str | None = None, lexicons: Sequence[str] = (), ) -> Iterator[_ExistingILI]: query = """ SELECT DISTINCT i.id, ist.status, i.definition, i.metadata FROM ilis AS i JOIN ili_statuses AS ist ON i.status_rowid = ist.rowid """ conditions: list[str] = [] params: list = [] if status: conditions.append("ist.status = ?") params.append(status) if lexicons: # this runs much faster than just adding a condition query = """ SELECT DISTINCT i.id, ist.status, i.definition, i.metadata FROM lexicons as lex JOIN synsets AS ss ON ss.lexicon_rowid = lex.rowid JOIN ilis AS i ON i.rowid = ss.ili_rowid JOIN ili_statuses AS ist ON i.status_rowid = ist.rowid """ conditions.append(f"lex.specifier IN ({_qs(lexicons)})") params.extend(lexicons) if conditions: query += " WHERE " + "\n AND ".join(conditions) yield from connect().execute(query, params) def find_proposed_ilis( synset_id: str | None = None, lexicons: Sequence[str] = (), ) -> Iterator[_ProposedILI]: query = """ SELECT ss.id, lex.specifier, pi.definition, pi.metadata FROM proposed_ilis AS pi JOIN synsets AS ss ON ss.rowid = synset_rowid JOIN lexicons AS lex ON lex.rowid = ss.lexicon_rowid """ conditions: list[str] = [] params: list = [] if synset_id is not None: conditions.append("ss.id = ?") params.append(synset_id) if lexicons: conditions.append(f"lex.specifier IN ({_qs(lexicons)})") params.extend(lexicons) if conditions: query += " WHERE " + "\n AND ".join(conditions) yield from connect().execute(query, params) def find_entries( id: str | None = None, forms: Sequence[str] = (), pos: str | None = None, lexicons: Sequence[str] = (), normalized: bool = False, search_all_forms: bool = False, ) -> Iterator[_Word]: conn = connect() cte, cteparams, conditions, condparams = _build_entry_conditions( forms, pos, lexicons, normalized, search_all_forms ) if id: conditions.insert(0, "e.id = ?") condparams.insert(0, id) condition = "" if conditions: condition 
= "WHERE " + "\n AND ".join(conditions) query = f""" {cte} SELECT DISTINCT e.id, e.pos, lex.specifier FROM entries AS e JOIN lexicons AS lex ON lex.rowid = e.lexicon_rowid {condition} ORDER BY e.rowid ASC """ rows: Iterator[_Word] = conn.execute(query, cteparams + condparams) yield from rows def _load_lemmas_with_details( conn, cte: str, cteparams: list, conditions: list[str], condparams: list, with_lexicons: bool, ) -> Iterator[Form]: """Load lemmas with pronunciations and tags (full details).""" plex_cond = "AND plex.specifier IN lexspecs" if with_lexicons else "" tlex_cond = "AND tlex.specifier IN lexspecs" if with_lexicons else "" condition = "" if conditions: condition = "AND " + "\n AND ".join(conditions) query = f""" {cte} SELECT DISTINCT f.rowid, f.form, f.id, f.script, lex.specifier, p.value, p.variety, p.notation, p.phonemic, p.audio, plex.specifier, t.tag, t.category, tlex.specifier FROM forms AS f JOIN entries AS e ON e.rowid = f.entry_rowid JOIN lexicons AS lex ON lex.rowid = e.lexicon_rowid LEFT JOIN pronunciations AS p ON p.form_rowid = f.rowid LEFT JOIN lexicons AS plex ON plex.rowid = p.lexicon_rowid {plex_cond} LEFT JOIN tags AS t ON t.form_rowid = f.rowid LEFT JOIN lexicons AS tlex ON tlex.rowid = t.lexicon_rowid {tlex_cond} WHERE f.rank = 0 {condition} ORDER BY f.rowid ASC """ # Group results by form_rowid and process pronunciations/tags forms_dict: dict[ int, tuple[str, str | None, str | None, str, list[Pronunciation], list[Tag]] ] = {} for row in conn.execute(query, cteparams + condparams): form_rowid, form, form_id, script, lexicon = row[0:5] pron_data = row[5:11] tag_data = row[11:14] if form_rowid not in forms_dict: forms_dict[form_rowid] = (form, form_id, script, lexicon, [], []) # Add pronunciation if present if pron_data[0] is not None: # value pron = cast("Pronunciation", pron_data) if pron not in forms_dict[form_rowid][4]: forms_dict[form_rowid][4].append(pron) # Add tag if present if tag_data[0] is not None: # tag tag = cast("Tag", 
tag_data) if tag not in forms_dict[form_rowid][5]: forms_dict[form_rowid][5].append(tag) # Yield forms in order yield from forms_dict.values() def find_lemmas( forms: Sequence[str] = (), pos: str | None = None, lexicons: Sequence[str] = (), normalized: bool = False, search_all_forms: bool = False, load_details: bool = False, ) -> Iterator[Form]: """Find lemmas matching the given criteria. Returns form data for the lemma of each matching entry. If load_details is False, pronunciations and tags are not loaded. """ conn = connect() cte, cteparams, conditions, condparams = _build_entry_conditions( forms, pos, lexicons, normalized, search_all_forms ) if not load_details: # Fast path: don't load pronunciations and tags condition = "" if conditions: condition = "AND " + "\n AND ".join(conditions) query = f""" {cte} SELECT f.form, f.id, f.script, lex.specifier FROM forms AS f JOIN entries AS e ON e.rowid = f.entry_rowid JOIN lexicons AS lex ON lex.rowid = e.lexicon_rowid WHERE f.rank = 0 {condition} ORDER BY f.rowid ASC """ for row in conn.execute(query, cteparams + condparams): form, form_id, script, lexicon = row yield (form, form_id, script, lexicon, [], []) else: # Full path: load pronunciations and tags yield from _load_lemmas_with_details( conn, cte, cteparams, conditions, condparams, bool(lexicons) ) def find_senses( id: str | None = None, forms: Sequence[str] = (), pos: str | None = None, lexicons: Sequence[str] = (), normalized: bool = False, search_all_forms: bool = False, ) -> Iterator[Sense]: conn = connect() ctes: list[str] = [] params: list = [] conditions = [] order = "s.rowid" if id: conditions.append("s.id = ?") params.append(id) if forms: ctes, subquery = _query_forms(forms, normalized, search_all_forms) conditions.append(f"s.entry_rowid IN {subquery}") params.extend(forms) order = "s.lexicon_rowid, e.pos, s.entry_rank" if pos: conditions.append("e.pos = ?") params.append(pos) if lexicons: conditions.append(f"slex.specifier IN ({_qs(lexicons)})") 
params.extend(lexicons) cte = "" if ctes: cte = "WITH " + ",\n ".join(ctes) condition = "" if conditions: condition = "WHERE " + "\n AND ".join(conditions) query = f""" {cte} SELECT DISTINCT s.id, e.id, ss.id, slex.specifier FROM senses AS s JOIN entries AS e ON e.rowid = s.entry_rowid JOIN synsets AS ss ON ss.rowid = s.synset_rowid JOIN lexicons AS slex ON slex.rowid = s.lexicon_rowid {condition} ORDER BY {order} ASC """ rows: Iterator[Sense] = conn.execute(query, params) yield from rows def find_synsets( id: str | None = None, forms: Sequence[str] = (), pos: str | None = None, ili: str | None = None, lexicons: Sequence[str] = (), normalized: bool = False, search_all_forms: bool = False, ) -> Iterator[_Synset]: conn = connect() ctes: list[str] = [] join = "" conditions = [] order = "ss.rowid" params: list = [] if id: conditions.append("ss.id = ?") params.append(id) if forms: ctes, subquery = _query_forms(forms, normalized, search_all_forms) join = f"""\ JOIN (SELECT _s.entry_rowid, _s.synset_rowid, _s.entry_rank FROM senses AS _s WHERE _s.entry_rowid IN {subquery} ) AS s ON s.synset_rowid = ss.rowid """.strip() params.extend(forms) order = "ss.lexicon_rowid, ss.pos, s.entry_rank" if pos: conditions.append("ss.pos = ?") params.append(pos) if ili: conditions.append( "ss.ili_rowid IN (SELECT ilis.rowid FROM ilis WHERE ilis.id = ?)" ) params.append(ili) if lexicons: conditions.append(f"sslex.specifier IN ({_qs(lexicons)})") params.extend(lexicons) cte = "" if ctes: cte = "WITH " + ",\n ".join(ctes) condition = "" if conditions: condition = "WHERE " + "\n AND ".join(conditions) query = f""" {cte} SELECT DISTINCT ss.id, ss.pos, (SELECT ilis.id FROM ilis WHERE ilis.rowid=ss.ili_rowid), sslex.specifier FROM synsets AS ss JOIN lexicons AS sslex ON sslex.rowid = ss.lexicon_rowid {join} {condition} ORDER BY {order} ASC """ rows: Iterator[_Synset] = conn.execute(query, params) yield from rows def get_entry_forms(id: str, lexicons: Sequence[str]) -> Iterator[Form]: form_query 
= f""" WITH lexspecs(s) AS (VALUES {_vs(lexicons)}) SELECT f.rowid, f.form, f.id, f.script, lex.specifier FROM forms AS f JOIN entries AS e ON e.rowid = entry_rowid JOIN lexicons AS lex ON lex.rowid = e.lexicon_rowid WHERE e.id = ? AND lex.specifier IN lexspecs ORDER BY f.rank """ pron_query = f""" WITH lexspecs(s) AS (VALUES {_vs(lexicons)}) SELECT p.value, p.variety, p.notation, p.phonemic, p.audio, lex.specifier FROM pronunciations AS p JOIN lexicons AS lex ON lex.rowid = p.lexicon_rowid WHERE form_rowid = ? AND lex.specifier IN lexspecs """ tag_query = f""" WITH lexspecs(s) AS (VALUES {_vs(lexicons)}) SELECT t.tag, t.category, lex.specifier FROM tags AS t JOIN lexicons AS lex ON lex.rowid = t.lexicon_rowid WHERE form_rowid = ? AND lex.specifier IN lexspecs """ cur = connect().cursor() for row in cur.execute(form_query, (*lexicons, id)).fetchall(): params = (*lexicons, row[0]) prons: list[Pronunciation] = cur.execute(pron_query, params).fetchall() tags: list[Tag] = cur.execute(tag_query, params).fetchall() yield (*row[1:], prons, tags) def get_synsets_for_ilis( ilis: Collection[str], lexicons: Sequence[str], ) -> Iterator[_Synset]: conn = connect() query = f""" SELECT DISTINCT ss.id, ss.pos, ili.id, sslex.specifier FROM synsets as ss JOIN ilis as ili ON ss.ili_rowid = ili.rowid JOIN lexicons AS sslex ON sslex.rowid = ss.lexicon_rowid WHERE ili.id IN ({_qs(ilis)}) AND sslex.specifier IN ({_qs(lexicons)}) """ params = *ilis, *lexicons result_rows: Iterator[_Synset] = conn.execute(query, params) yield from result_rows def get_synset_relations( synset_id: str, synset_lexicon: str, relation_types: Collection[str], lexicons: Sequence[str], target_lexicons: Sequence[str], ) -> Iterator[_Synset_Relation]: conn = connect() params: list = [] constraint = "" if relation_types and "*" not in relation_types: constraint = f"WHERE type IN ({_qs(relation_types)})" params.extend(relation_types) params.extend(lexicons) params.extend(target_lexicons) params.append(synset_id) 
params.append(synset_lexicon) query = f""" WITH reltypes(rowid) AS (SELECT rowid FROM relation_types {constraint}), lexrowids(rowid) AS (SELECT rowid FROM lexicons WHERE specifier IN ({_vs(lexicons)})), tgtlexrowids(rowid) AS (SELECT rowid FROM lexicons WHERE specifier IN ({_vs(target_lexicons)})), srcsynset(rowid) AS (SELECT ss.rowid FROM synsets AS ss JOIN lexicons AS lex ON lex.rowid = ss.lexicon_rowid WHERE ss.id = ? AND lex.specifier = ?), matchingrels(rowid) AS (SELECT srel.rowid FROM synset_relations AS srel WHERE srel.source_rowid IN srcsynset AND srel.lexicon_rowid IN lexrowids AND srel.type_rowid IN reltypes) SELECT DISTINCT rt.type, lex.specifier, srel.metadata, src.id, tgt.id, tgt.pos, tgtili.id, tgtlex.specifier FROM matchingrels AS mr JOIN synset_relations AS srel ON srel.rowid=mr.rowid JOIN relation_types AS rt ON rt.rowid=srel.type_rowid JOIN synsets AS src ON src.rowid = srel.source_rowid JOIN synsets AS tgt ON tgt.rowid = srel.target_rowid JOIN lexicons AS lex ON lex.rowid = srel.lexicon_rowid JOIN lexicons AS tgtlex ON tgtlex.rowid = tgt.lexicon_rowid LEFT JOIN ilis AS tgtili ON tgtili.rowid = tgt.ili_rowid -- might be null WHERE tgt.lexicon_rowid IN tgtlexrowids -- ensure target is included """ result_rows: Iterator[_Synset_Relation] = conn.execute(query, params) yield from result_rows def get_expanded_synset_relations( ili_id: str, relation_types: Collection[str], expands: Sequence[str], ) -> Iterator[_Synset_Relation]: conn = connect() params: list = [] constraint = "" if relation_types and "*" not in relation_types: constraint = f"WHERE type IN ({_qs(relation_types)})" params.extend(relation_types) params.extend(expands) params.append(ili_id) query = f""" WITH reltypes(rowid) AS (SELECT rowid FROM relation_types {constraint}), lexrowids(rowid) AS (SELECT rowid FROM lexicons WHERE specifier IN ({_vs(expands)})), srcsynset(rowid) AS (SELECT ss.rowid FROM synsets AS ss JOIN ilis ON ilis.rowid = ss.ili_rowid WHERE ilis.id = ? 
AND ss.lexicon_rowid IN lexrowids), matchingrels(rowid) AS (SELECT srel.rowid FROM synset_relations AS srel WHERE srel.source_rowid IN srcsynset AND srel.lexicon_rowid IN lexrowids AND srel.type_rowid IN reltypes) SELECT DISTINCT rt.type, lex.specifier, srel.metadata, src.id, tgt.id, tgt.pos, tgtili.id, tgtlex.specifier FROM matchingrels AS mr JOIN synset_relations AS srel ON srel.rowid=mr.rowid JOIN relation_types AS rt ON rt.rowid=srel.type_rowid JOIN synsets AS src ON src.rowid = srel.source_rowid JOIN synsets AS tgt ON tgt.rowid = srel.target_rowid JOIN ilis AS tgtili ON tgtili.rowid = tgt.ili_rowid JOIN lexicons AS lex ON lex.rowid = srel.lexicon_rowid JOIN lexicons AS tgtlex ON tgtlex.rowid = tgt.lexicon_rowid """ result_rows: Iterator[_Synset_Relation] = conn.execute(query, params) yield from result_rows def get_definitions( synset_id: str, lexicons: Sequence[str], ) -> list[_Definition]: conn = connect() query = f""" SELECT d.definition, d.language, (SELECT s.id FROM senses AS s WHERE s.rowid=d.sense_rowid), lex.specifier, d.metadata FROM definitions AS d JOIN synsets AS ss ON ss.rowid = d.synset_rowid JOIN lexicons AS lex ON lex.rowid = d.lexicon_rowid WHERE ss.id = ? AND lex.specifier IN ({_qs(lexicons)}) """ return conn.execute(query, (synset_id, *lexicons)).fetchall() _SANITIZED_EXAMPLE_PREFIXES = { "senses": "sense", "synsets": "synset", } def get_examples( id: str, table: str, lexicons: Sequence[str], ) -> list[_Example]: conn = connect() prefix = _SANITIZED_EXAMPLE_PREFIXES.get(table) if prefix is None: raise Error(f"'{table}' does not have examples") query = f""" SELECT ex.example, ex.language, lex.specifier, ex.metadata FROM {prefix}_examples AS ex JOIN {table} AS tbl ON tbl.rowid = ex.{prefix}_rowid JOIN lexicons AS lex ON lex.rowid = ex.lexicon_rowid WHERE tbl.id = ? 
AND lex.specifier IN ({_qs(lexicons)}) """ return conn.execute(query, (id, *lexicons)).fetchall() def find_syntactic_behaviours( id: str | None = None, lexicons: Sequence[str] = (), ) -> Iterator[_SyntacticBehaviour]: conn = connect() query = """ SELECT sb.id, sb.frame, s.id FROM syntactic_behaviours AS sb JOIN syntactic_behaviour_senses AS sbs ON sbs.syntactic_behaviour_rowid = sb.rowid JOIN senses AS s ON s.rowid = sbs.sense_rowid JOIN lexicons AS lex ON lex.rowid = sb.lexicon_rowid """ conditions: list[str] = [] params: list = [] if id: conditions.append("sb.id = ?") params.append(id) if lexicons: conditions.append(f"lex.specifier IN ({_qs(lexicons)})") params.extend(lexicons) if conditions: query += "\n WHERE " + "\n AND ".join(conditions) rows: Iterator[tuple[str, str, str]] = conn.execute(query, params) for key, group in itertools.groupby(rows, lambda row: row[0:2]): id, frame = cast("tuple[str, str]", key) sense_ids = [row[2] for row in group] yield id, frame, sense_ids def get_syntactic_behaviours( sense_id: str, lexicons: Sequence[str], ) -> list[str]: conn = connect() query = f""" SELECT sb.frame FROM syntactic_behaviours AS sb JOIN syntactic_behaviour_senses AS sbs ON sbs.syntactic_behaviour_rowid = sb.rowid JOIN senses AS s ON s.rowid = sbs.sense_rowid JOIN lexicons AS lex ON lex.rowid = sb.lexicon_rowid WHERE s.id = ? 
AND lex.specifier IN ({_qs(lexicons)}) """ return [row[0] for row in conn.execute(query, (sense_id, *lexicons))] def _get_senses( id: str, sourcetype: str, lexicons: Sequence[str], order_by_rank: bool = True ) -> Iterator[Sense]: conn = connect() match sourcetype: case "entry": sourcealias = "e" case "synset": sourcealias = "ss" case _: raise Error(f"invalid sense source type: {sourcetype}") order_col = f"{sourcetype}_rank" if order_by_rank else "rowid" query = f""" SELECT s.id, e.id, ss.id, slex.specifier FROM senses AS s JOIN entries AS e ON e.rowid = s.entry_rowid JOIN synsets AS ss ON ss.rowid = s.synset_rowid JOIN lexicons AS slex ON slex.rowid = s.lexicon_rowid WHERE {sourcealias}.id = ? AND slex.specifier IN ({_qs(lexicons)}) ORDER BY s.{order_col} """ return conn.execute(query, (id, *lexicons)) def get_entry_senses( sense_id: str, lexicons: Sequence[str], order_by_rank: bool = True ) -> Iterator[Sense]: yield from _get_senses(sense_id, "entry", lexicons, order_by_rank) def get_synset_members( synset_id: str, lexicons: Sequence[str], order_by_rank: bool = True ) -> Iterator[Sense]: yield from _get_senses(synset_id, "synset", lexicons, order_by_rank) def get_sense_relations( sense_id: str, relation_types: Collection[str], lexicons: Sequence[str], target_lexicons: Sequence[str], ) -> Iterator[_Sense_Relation]: params: list = [] constraint = "" if relation_types and "*" not in relation_types: constraint = f"WHERE type IN ({_qs(relation_types)})" params.extend(relation_types) params.extend(lexicons) params.extend(target_lexicons) params.append(sense_id) query = f""" WITH rt(rowid, type) AS (SELECT rowid, type FROM relation_types {constraint}), lexrowids(rowid) AS (SELECT rowid FROM lexicons WHERE specifier IN ({_vs(lexicons)})), tgtlexrowids(rowid) AS (SELECT rowid FROM lexicons WHERE specifier IN ({_vs(target_lexicons)})) SELECT DISTINCT rel.type, rel.lexicon, rel.metadata, s.id, e.id, ss.id, slex.specifier FROM (SELECT rt.type, lex.specifier AS lexicon, 
srel.metadata AS metadata, target_rowid FROM sense_relations AS srel JOIN rt ON srel.type_rowid = rt.rowid JOIN lexicons AS lex ON srel.lexicon_rowid = lex.rowid JOIN senses AS s ON s.rowid = srel.source_rowid WHERE s.id = ? AND srel.lexicon_rowid IN lexrowids ) AS rel JOIN senses AS s ON s.rowid = rel.target_rowid AND s.lexicon_rowid IN tgtlexrowids JOIN lexicons AS slex ON slex.rowid = s.lexicon_rowid JOIN entries AS e ON e.rowid = s.entry_rowid JOIN synsets AS ss ON ss.rowid = s.synset_rowid """ rows: Iterator[_Sense_Relation] = connect().execute(query, params) yield from rows def get_sense_synset_relations( sense_id: str, relation_types: Collection[str], lexicons: Sequence[str], target_lexicons: Sequence[str], ) -> Iterator[_Synset_Relation]: params: list = [] constraint = "" if "*" not in relation_types: constraint = f"WHERE type IN ({_qs(relation_types)})" params.extend(relation_types) params.extend(lexicons) params.extend(target_lexicons) params.append(sense_id) query = f""" WITH rt(rowid, type) AS (SELECT rowid, type FROM relation_types {constraint}), lexrowids(rowid) AS (SELECT rowid FROM lexicons WHERE specifier IN ({_vs(lexicons)})), tgtlexrowids(rowid) AS (SELECT rowid FROM lexicons WHERE specifier IN ({_vs(target_lexicons)})) SELECT DISTINCT rel.type, rel.lexicon, rel.metadata, rel.source_rowid, tgt.id, tgt.pos, (SELECT ilis.id FROM ilis WHERE ilis.rowid = tgt.ili_rowid), tgtlex.specifier FROM (SELECT rt.type, lex.specifier AS lexicon, srel.metadata AS metadata, source_rowid, target_rowid FROM sense_synset_relations AS srel JOIN rt ON srel.type_rowid = rt.rowid JOIN lexicons AS lex ON srel.lexicon_rowid = lex.rowid JOIN senses AS s ON s.rowid = srel.source_rowid WHERE s.id = ? 
AND srel.lexicon_rowid IN lexrowids ) AS rel JOIN synsets AS tgt ON tgt.rowid = rel.target_rowid AND tgt.lexicon_rowid IN tgtlexrowids JOIN lexicons AS tgtlex ON tgtlex.rowid = tgt.lexicon_rowid """ rows: Iterator[_Synset_Relation] = connect().execute(query, params) yield from rows def get_relation_targets( rel_table: str, tgt_table: str, lexicons: Sequence[str], target_lexicons: Sequence[str], ) -> set[str]: if rel_table not in { "sense_relations", "sense_synset_relations", "synset_relations", }: raise ValueError(f"invalid relation table: {rel_table}") if tgt_table not in ("senses", "synsets"): raise ValueError(f"invalid target table: {tgt_table}") params: list = [*lexicons, *target_lexicons] query = f""" WITH lexrowids(rowid) AS (SELECT rowid FROM lexicons WHERE specifier IN ({_vs(lexicons)})), tgtlexrowids(rowid) AS (SELECT rowid FROM lexicons WHERE specifier IN ({_vs(target_lexicons)})) SELECT DISTINCT tgt.id FROM {rel_table} AS srel JOIN lexicons AS lex ON srel.lexicon_rowid = lex.rowid JOIN {tgt_table} AS tgt ON tgt.rowid = srel.target_rowid WHERE srel.lexicon_rowid IN lexrowids AND tgt.lexicon_rowid IN tgtlexrowids """ rows: Iterator[str] = connect().execute(query, params) return {row[0] for row in rows} _SANITIZED_METADATA_TABLES = { # 'ilis': 'ilis', # 'proposed_ilis': 'proposed_ilis', # 'lexicons': 'lexicons', "entries": "entries", "senses": "senses", "synsets": "synsets", # 'sense_relations': 'sense_relations', # 'sense_synset_relations': 'sense_synset_relations', # 'synset_relations': 'synset_relations', # 'sense_examples': 'sense_examples', # 'counts': 'counts', # 'synset_examples': 'synset_examples', # 'definitions': 'definitions', } def get_metadata(id: str, lexicon: str, table: str) -> Metadata: tablename = _SANITIZED_METADATA_TABLES.get(table) if tablename is None: raise Error(f"'{table}' does not contain metadata") query = f""" SELECT tbl.metadata FROM {tablename} AS tbl JOIN lexicons AS lex ON lex.rowid = lexicon_rowid WHERE tbl.id=? 
AND lex.specifier = ? """ return cast( "Metadata", connect().execute(query, (id, lexicon)).fetchone()[0] or {}, ) # TODO: benchmark using a TypeGuard def get_ili_metadata(id: str) -> Metadata: query = "SELECT metadata FROM ilis WHERE id = ?" return cast( "Metadata", connect().execute(query, (id,)).fetchone()[0] or {}, ) def get_proposed_ili_metadata(synset: str, lexicon: str) -> Metadata: query = """ SELECT pili.metadata FROM proposed_ilis AS pili JOIN synsets AS ss ON ss.rowid = synset_rowid JOIN lexicons AS lex ON lex.rowid = ss.lexicon_rowid WHERE ss.id = ? AND lex.specifier = ? """ return cast( "Metadata", connect().execute(query, (synset, lexicon)).fetchone()[0] or {}, ) _SANITIZED_LEXICALIZED_TABLES = { "senses": ("senses", "sense_rowid"), "synsets": ("synsets", "synset_rowid"), } def get_lexicalized(id: str, lexicon: str, table: str) -> bool: conn = connect() if table not in _SANITIZED_LEXICALIZED_TABLES: raise Error(f"'{table}' does not mark lexicalization") tablename, column = _SANITIZED_LEXICALIZED_TABLES[table] if not id or not lexicon: return False query = f""" SELECT NOT EXISTS (SELECT {column} FROM unlexicalized_{tablename} AS un JOIN {tablename} AS tbl ON tbl.rowid = un.{column} JOIN lexicons AS lex ON lex.rowid = tbl.lexicon_rowid WHERE tbl.id = ? AND lex.specifier = ?) """ return bool(conn.execute(query, (id, lexicon)).fetchone()[0]) def get_adjposition(sense_id: str, lexicon: str) -> str | None: conn = connect() query = """ SELECT adjposition FROM adjpositions JOIN senses AS s ON s.rowid = sense_rowid JOIN lexicons AS lex ON lex.rowid = s.lexicon_rowid WHERE s.id = ? AND lex.specifier = ? 
""" row = conn.execute(query, (sense_id, lexicon)).fetchone() if row: return row[0] return None def get_sense_counts(sense_id: str, lexicons: Sequence[str]) -> list[_Count]: conn = connect() query = f""" SELECT c.count, lex.specifier, c.metadata FROM counts AS c JOIN senses AS s ON s.rowid = c.sense_rowid JOIN lexicons AS lex ON lex.rowid = c.lexicon_rowid WHERE s.id = ? AND lex.specifier IN ({_qs(lexicons)}) """ rows: list[_Count] = conn.execute(query, (sense_id, *lexicons)).fetchall() return rows def get_lexfile(synset_id: str, lexicon: str) -> str | None: conn = connect() query = """ SELECT lf.name FROM lexfiles AS lf JOIN synsets AS ss ON ss.lexfile_rowid = lf.rowid JOIN lexicons AS lex ON lex.rowid = ss.lexicon_rowid WHERE ss.id = ? AND lex.specifier = ? """ row = conn.execute(query, (synset_id, lexicon)).fetchone() if row is not None and row[0] is not None: return row[0] return None def get_entry_index(entry_id: str, lexicon: str) -> str | None: conn = connect() query = """ SELECT idx.lemma FROM entries AS e JOIN lexicons AS lex ON lex.rowid = e.lexicon_rowid JOIN entry_index AS idx ON idx.entry_rowid = e.rowid WHERE e.id = ? AND lex.specifier = ? """ row = conn.execute(query, (entry_id, lexicon)).fetchone() if row is not None: return row[0] return None def get_sense_n(sense_id: str, lexicon: str) -> int | None: conn = connect() query = """ SELECT s.entry_rank FROM senses AS s JOIN lexicons AS lex ON lex.rowid = s.lexicon_rowid WHERE s.id = ? AND lex.specifier = ? """ row = conn.execute(query, (sense_id, lexicon)).fetchone() if row is not None: return row[0] return None def _qs(xs: Collection) -> str: return ",".join("?" 
* len(xs)) def _vs(xs: Collection) -> str: return ",".join(["(?)"] * len(xs)) def _kws(xs: Collection) -> str: return ",".join(f":{x}" for x in xs) def _query_forms( forms: Sequence[str], normalized: bool, search_all_forms: bool, indexed: bool = True, ) -> tuple[list[str], str]: or_norm = "OR f.normalized_form IN wordforms" if normalized else "" and_rank = "" if search_all_forms else "AND f.rank = 0" ctes: list[str] = [ f"wordforms(s) AS (VALUES {_vs(forms)})", f"""matched_entries(rowid) AS (SELECT f.entry_rowid FROM forms AS f WHERE (f.form IN wordforms {or_norm}) {and_rank})""", ] subquery = "matched_entries" if indexed: subquery = """\ (SELECT rowid FROM matched_entries UNION SELECT idx.entry_rowid FROM matched_entries AS _me JOIN entry_index AS _idx ON _idx.entry_rowid = _me.rowid JOIN entry_index AS idx ON idx.lemma = _idx.lemma) """ return ctes, subquery def _build_entry_conditions( forms: Sequence[str], pos: str | None, lexicons: Sequence[str], normalized: bool, search_all_forms: bool, ) -> tuple[str, list[str], list[str], list[str]]: """Build CTE, conditions, and parameters for entry-based queries. 
Returns: tuple of (cte, conditions, params) """ ctes: list[str] = [] cteparams: list[str] = [] subquery = "" conditions: list[str] = [] condparams: list[str] = [] if lexicons: ctes.append(f"lexspecs(s) AS (VALUES {_vs(lexicons)})") conditions.append("lex.specifier IN lexspecs") cteparams.extend(lexicons) if forms: ctes_, subquery = _query_forms(forms, normalized, search_all_forms) ctes.extend(ctes_) conditions.append(f"e.rowid IN {subquery}") cteparams.extend(forms) if pos: conditions.append("e.pos = ?") condparams.append(pos) cte = "" if ctes: cte = "WITH " + ",\n ".join(ctes) return cte, cteparams, conditions, condparams wn-1.0.0/wn/_types.py000066400000000000000000000020341513755206300144430ustar00rootroot00000000000000from collections.abc import Callable, Mapping, Sequence from pathlib import Path from typing import Any, TypeAlias # For the below, use type statement instead of TypeAlias from Python 3.12 # For functions taking a filesystem path as a str or a pathlib.Path AnyPath: TypeAlias = str | Path # LMF versions for comparison VersionInfo: TypeAlias = tuple[int, ...] 
# Synset and Sense relations map a relation type to one or more ids RelationMap: TypeAlias = Mapping[str, Sequence[str]] # User-facing metadata representation Metadata: TypeAlias = dict[str, Any] # A callable that returns a normalized word form for a given word form NormalizeFunction: TypeAlias = Callable[[str], str] # Lemmatization returns a mapping of parts of speech (or None) to # lists of wordforms that are potential lemmas for some query word LemmatizeResult: TypeAlias = dict[str | None, set[str]] # A callable that returns a LemmatizationResult for a given word form # and optional part of speech LemmatizeFunction: TypeAlias = Callable[[str, str | None], LemmatizeResult] wn-1.0.0/wn/_util.py000066400000000000000000000040021513755206300142510ustar00rootroot00000000000000"""Non-public Wn utilities.""" import hashlib from collections.abc import Hashable, Iterable from pathlib import Path from typing import TypeVar from unicodedata import combining, normalize from wn._types import VersionInfo def version_info(version_string: str) -> VersionInfo: return tuple(map(int, version_string.split("."))) def is_url(string: str) -> bool: """Return True if *string* appears to be a URL.""" # TODO: ETags? 
return any(string.startswith(scheme) for scheme in ("http://", "https://")) def is_gzip(path: Path) -> bool: """Return True if the file at *path* appears to be gzipped.""" return _inspect_file_signature(path, b"\x1f\x8b") def is_lzma(path: Path) -> bool: """Return True if the file at *path* appears to be lzma-compressed.""" return _inspect_file_signature(path, b"\xfd7zXZ\x00") def is_xml(path: Path) -> bool: """Return True if the file at *path* appears to be an XML file.""" return _inspect_file_signature(path, b" bool: if path.is_file(): with path.open("rb") as f: return f.read(len(signature)) == signature return False def short_hash(string: str) -> str: """Return a short hash of *string*.""" b2 = hashlib.blake2b(digest_size=20) b2.update(string.encode("utf-8")) return b2.hexdigest() T = TypeVar("T") def flatten(iterable: Iterable[Iterable[T]]) -> list[T]: return [x for xs in iterable for x in xs] H = TypeVar("H", bound=Hashable) def unique_list(items: Iterable[H]) -> list[H]: # use a dictionary as an order-preserving set targets = dict.fromkeys(items, True) return list(targets) def normalize_form(s: str) -> str: return "".join(c for c in normalize("NFKD", s.casefold()) if not combining(c)) def format_lexicon_specifier(id: str, version: str) -> str: return f"{id}:{version}" def split_lexicon_specifier(lexicon: str) -> tuple[str, str]: id, _, ver = lexicon.partition(":") return id, ver wn-1.0.0/wn/_wordnet.py000066400000000000000000000374361513755206300147770ustar00rootroot00000000000000import textwrap import warnings from collections.abc import Callable, Iterator, Sequence from typing import Literal, TypeVar, overload from wn._core import Form, Pronunciation, Sense, Synset, Tag, Word from wn._exceptions import Error, WnWarning from wn._lexicon import Lexicon, LexiconConfiguration from wn._queries import ( find_entries, find_lemmas, find_senses, find_synsets, get_lexicon_dependencies, resolve_lexicon_specifiers, ) from wn._types import ( LemmatizeFunction, 
NormalizeFunction, ) from wn._util import normalize_form # Useful for factory functions of Word, Sense, or Synset C = TypeVar("C", Word, Sense, Synset) class Wordnet: """Class for interacting with wordnet data. A wordnet object acts essentially as a filter by first selecting matching lexicons and then searching only within those lexicons for later queries. Lexicons can be selected on instantiation with the *lexicon* or *lang* parameters. The *lexicon* parameter is a string with a space-separated list of :ref:`lexicon specifiers `. The *lang* argument is a `BCP 47`_ language code that selects any lexicon matching the given language code. As the *lexicon* argument more precisely selects lexicons, it is the recommended method of instantiation. Omitting both *lexicon* and *lang* arguments triggers :ref:`default-mode ` queries. Some wordnets were created by translating the words from a larger wordnet, namely the Princeton WordNet, and then relying on the larger wordnet for structural relations. An *expand* argument is a second space-separated list of lexicon specifiers which are used for traversing relations, but not as the results of queries. Setting *expand* to an empty string (:python:`expand=''`) disables expand lexicons. For more information, see :ref:`cross-lingual-relation-traversal`. The *normalizer* argument takes a callable that normalizes word forms in order to expand the search. The default function downcases the word and removes diacritics via NFKD_ normalization so that, for example, searching for *san josé* in the English WordNet will find the entry for *San Jose*. Setting *normalizer* to :python:`None` disables normalization and forces exact-match searching. For more information, see :ref:`normalization`. The *lemmatizer* argument may be :python:`None`, which is the default and disables lemmatizer-based query expansion, or a callable that takes a word form and optional part of speech and returns base forms of the original word. 
To support lemmatizers that use the wordnet for instantiation, such as :mod:`wn.morphy`, the lemmatizer may be assigned to the :attr:`lemmatizer` attribute after creation. For more information, see :ref:`lemmatization`. If the *search_all_forms* argument is :python:`True` (the default), searches of word forms consider all forms in the lexicon; if :python:`False`, only lemmas are searched. Non-lemma forms may include, depending on the lexicon, morphological exceptions, alternate scripts or spellings, etc. .. _BCP 47: https://en.wikipedia.org/wiki/IETF_language_tag .. _NFKD: https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms Attributes: lemmatizer: A lemmatization function or :python:`None`. """ __slots__ = ( "_default_mode", "_lexconf", "_normalizer", "_search_all_forms", "lemmatizer", ) __module__ = "wn" def __init__( self, lexicon: str | None = None, *, lang: str | None = None, expand: str | None = None, normalizer: NormalizeFunction | None = normalize_form, lemmatizer: LemmatizeFunction | None = None, search_all_forms: bool = True, ): if lexicon or lang: lexicons = tuple(resolve_lexicon_specifiers(lexicon or "*", lang=lang)) else: lexicons = () if lang and len(lexicons) > 1: warnings.warn( f"multiple lexicons match {lang=}: {lexicons!r}; " "use the lexicon parameter instead to avoid this warning", WnWarning, stacklevel=2, ) # default mode means any lexicon is searched or expanded upon, # but relation traversals only target the source's lexicon default_mode = not lexicon and not lang expand = _resolve_lexicon_dependencies(expand, lexicons, default_mode) expands = tuple(resolve_lexicon_specifiers(expand)) if expand else () self._lexconf = LexiconConfiguration( lexicons=lexicons, expands=expands, default_mode=default_mode, ) self._normalizer = normalizer self.lemmatizer = lemmatizer self._search_all_forms = search_all_forms def lexicons(self) -> list[Lexicon]: """Return the list of lexicons covered by this wordnet.""" return 
list(map(Lexicon.from_specifier, self._lexconf.lexicons)) def expanded_lexicons(self) -> list[Lexicon]: """Return the list of expand lexicons for this wordnet.""" return list(map(Lexicon.from_specifier, self._lexconf.expands)) def word(self, id: str) -> Word: """Return the first word in this wordnet with identifier *id*.""" iterable = find_entries(id=id, lexicons=self._lexconf.lexicons) try: id, pos, lex = next(iterable) return Word(id, pos, _lexicon=lex, _lexconf=self._lexconf) except StopIteration: raise Error(f"no such lexical entry: {id}") from None def words(self, form: str | None = None, pos: str | None = None) -> list[Word]: """Return the list of matching words in this wordnet. Without any arguments, this function returns all words in the wordnet's selected lexicons. A *form* argument restricts the words to those matching the given word form, and *pos* restricts words by their part of speech. """ return _find_helper(self, Word, find_entries, form, pos) @overload def lemmas( self, form: str | None = None, pos: str | None = None, *, data: Literal[False] = False, ) -> list[str]: ... @overload def lemmas( self, form: str | None = None, pos: str | None = None, *, data: Literal[True] = True, ) -> list[Form]: ... # fallback for non-literal bool argument @overload def lemmas( self, form: str | None = None, pos: str | None = None, *, data: bool ) -> list[str] | list[Form]: ... def lemmas( self, form: str | None = None, pos: str | None = None, *, data: bool = False ) -> list[str] | list[Form]: """Return the list of lemmas for matching words in this wordnet. Without any arguments, this function returns all distinct lemma forms in the wordnet's selected lexicons. A *form* argument restricts the words to those matching the given word form, and *pos* restricts words by their part of speech. If the *data* argument is :python:`False` (the default), only distinct lemma forms are returned as :class:`str` types. 
If it is :python:`True`, :class:`wn.Form` objects are returned for all matching entries, which may include multiple Form objects with the same lemma string. Example: >>> wn.Wordnet().lemmas("wolves") ['wolf'] >>> wn.Wordnet().lemmas("wolves", data=True) [Form(value='wolf')] """ form_data = _find_lemmas(self, form, pos, load_details=data) if data: return [ Form( form, id=id, script=script, _lexicon=lex, _pronunciations=tuple(Pronunciation(*p) for p in prons), _tags=tuple(Tag(*t) for t in tags), ) for form, id, script, lex, prons, tags in form_data ] # When data=False, extract and deduplicate strings return list(dict.fromkeys(fd[0] for fd in form_data)) def synset(self, id: str) -> Synset: """Return the first synset in this wordnet with identifier *id*.""" iterable = find_synsets(id=id, lexicons=self._lexconf.lexicons) try: id, pos, ili, lex = next(iterable) return Synset(id, pos, ili=ili, _lexicon=lex, _lexconf=self._lexconf) except StopIteration: raise Error(f"no such synset: {id}") from None def synsets( self, form: str | None = None, pos: str | None = None, ili: str | None = None ) -> list[Synset]: """Return the list of matching synsets in this wordnet. Without any arguments, this function returns all synsets in the wordnet's selected lexicons. A *form* argument restricts synsets to those whose member words match the given word form. A *pos* argument restricts synsets to those with the given part of speech. An *ili* argument restricts synsets to those with the given interlingual index; generally this should select a unique synset within a single lexicon. 
""" return _find_helper(self, Synset, find_synsets, form, pos, ili=ili) def sense(self, id: str) -> Sense: """Return the first sense in this wordnet with identifier *id*.""" iterable = find_senses(id=id, lexicons=self._lexconf.lexicons) try: id, eid, ssid, lex = next(iterable) return Sense(id, eid, ssid, _lexicon=lex, _lexconf=self._lexconf) except StopIteration: raise Error(f"no such sense: {id}") from None def senses(self, form: str | None = None, pos: str | None = None) -> list[Sense]: """Return the list of matching senses in this wordnet. Without any arguments, this function returns all senses in the wordnet's selected lexicons. A *form* argument restricts the senses to those whose word matches the given word form, and *pos* restricts senses by their word's part of speech. """ return _find_helper(self, Sense, find_senses, form, pos) def describe(self) -> str: """Return a formatted string describing the lexicons in this wordnet. Example: >>> oewn = wn.Wordnet("oewn:2021") >>> print(oewn.describe()) Primary lexicons: oewn:2021 Label : Open English WordNet URL : https://github.com/globalwordnet/english-wordnet License: https://creativecommons.org/licenses/by/4.0/ Words : 163161 (a: 8386, n: 123456, r: 4481, s: 15231, v: 11607) Senses : 211865 Synsets: 120039 (a: 7494, n: 84349, r: 3623, s: 10727, v: 13846) ILIs : 120039 """ substrings = ["Primary lexicons:"] for lex in self.lexicons(): substrings.append(textwrap.indent(lex.describe(), " ")) if self._lexconf.expands: substrings.append("Expand lexicons:") for lex in self.expanded_lexicons(): substrings.append(textwrap.indent(lex.describe(full=False), " ")) return "\n".join(substrings) def _resolve_lexicon_dependencies( expand: str | None, lexicons: Sequence[str], default_mode: bool, ) -> str: if expand is not None: return expand.strip() if default_mode: return "*" # find dependencies specified by the lexicons deps = [ (depspec, added) for lexspec in lexicons for depspec, _, added in get_lexicon_dependencies(lexspec) 
] missing = " ".join(spec for spec, added in deps if not added) if missing: warnings.warn( f"lexicon dependencies not available: {missing}", WnWarning, stacklevel=3, ) return " ".join(spec for spec, added in deps if added) def _find_lemmas( w: Wordnet, form: str | None, pos: str | None, load_details: bool = False ) -> Iterator[tuple]: """Return an iterator of matching lemma form data. This works like _find_helper but returns raw form tuples instead of Word/Sense/Synset objects. The load_details parameter controls whether pronunciations and tags are loaded from the database. """ kwargs: dict = { "lexicons": w._lexconf.lexicons, "search_all_forms": w._search_all_forms, "load_details": load_details, } # easy case is when there is no form if form is None: yield from find_lemmas(pos=pos, **kwargs) return # if there's a form, we may need to lemmatize and normalize normalize = w._normalizer kwargs["normalized"] = bool(normalize) lemmatize = w.lemmatizer forms = lemmatize(form, pos) if lemmatize else {} # if no lemmatizer or word not covered by lemmatizer, back off to # the original form and pos if not forms: forms = {pos: {form}} yield from _query_with_forms(find_lemmas, forms, normalize, kwargs) def _query_with_forms( query_func: Callable, forms: dict[str | None, set[str]], normalize: NormalizeFunction | None, kwargs: dict, ) -> list[tuple]: """Query database with forms, falling back to normalized forms if needed. Queries the database for each pos/forms combination. If a normalizer is available and the original forms return no results, queries again with normalized forms. 
""" results = [] for _pos, _forms in forms.items(): results.extend(query_func(forms=_forms, pos=_pos, **kwargs)) # Only try normalized forms if we got no results with original forms if not results and normalize: for _pos, _forms in forms.items(): normalized_forms = [normalize(f) for f in _forms] results.extend(query_func(forms=normalized_forms, pos=_pos, **kwargs)) return results def _find_helper( w: Wordnet, cls: type[C], query_func: Callable, form: str | None, pos: str | None, ili: str | None = None, ) -> list[C]: """Return the list of matching wordnet entities. If the wordnet has a normalizer and the search includes a word form, the original word form is searched against both the original and normalized columns in the database. Then, if no results are found, the search is repeated with the normalized form. If the wordnet does not have a normalizer, only exact string matches are used. """ kwargs: dict = { "lexicons": w._lexconf.lexicons, "search_all_forms": w._search_all_forms, } if ili: kwargs["ili"] = ili # easy case is when there is no form # (for type checking, it is hard to guess the correct number of # fields in data, so ignore here and further down) if form is None: return [ cls(*data, _lexconf=w._lexconf) # type: ignore for data in query_func(pos=pos, **kwargs) ] # if there's a form, we may need to lemmatize and normalize normalize = w._normalizer kwargs["normalized"] = bool(normalize) lemmatize = w.lemmatizer forms = lemmatize(form, pos) if lemmatize else {} # if no lemmatizer or word not covered by lemmatizer, back off to # the original form and pos if not forms: forms = {pos: {form}} results_data = _query_with_forms(query_func, forms, normalize, kwargs) # we want unique results here, but a set can make the order # erratic, so filter manually results = [ cls(*data, _lexconf=w._lexconf) # type: ignore for data in results_data ] unique_results: list[C] = [] seen: set[C] = set() for result in results: if result not in seen: unique_results.append(result) 
seen.add(result) return unique_results wn-1.0.0/wn/compat/000077500000000000000000000000001513755206300140525ustar00rootroot00000000000000wn-1.0.0/wn/compat/__init__.py000066400000000000000000000000001513755206300161510ustar00rootroot00000000000000wn-1.0.0/wn/compat/sensekey.py000066400000000000000000000236741513755206300162660ustar00rootroot00000000000000"""Functions Related to Sense Keys Sense keys are identifiers of senses that (mostly) persist across wordnet versions. They are only used by the English wordnets. For the OMW lexicons derived from the Princeton WordNet and the EWN 2019/2020 lexicons, the sense key is encoded in the ``identifier`` metadata of a Sense: >>> import wn >>> en = wn.Wordnet("omw-en:1.4") >>> sense = en.sense("omw-en-carrousel-02966372-n") >>> sense.metadata() {'identifier': 'carrousel%1:06:01::'} For OEWN 2021+ lexicons, the sense key is encoded in the sense ID, but some characters are escaped or replaced to ensure it is a valid XML ID. >>> oewn = wn.Wordnet("oewn:2024") >>> sense = oewn.sense("oewn-carousel__1.06.01..") >>> sense.id 'oewn-carousel__1.06.01..' This module has four functions: 1. :func:`escape` transforms a sense key into a form that is valid for XML IDs. The *flavor* keyword argument specifies the escaping mechanism and it defaults to :python:`"oewn-v2"`. 2. :func:`unescape` transforms an escaped sense key back into the original form. The *flavor* keyword is the same as with :func:`escape`. 3. :func:`sense_key_getter` creates a function for retrieving the sense key for a given :class:`wn.Sense` object. Depending on the lexicon, it will retrieve the sense key from metadata or it will unescape the sense ID. 4. :func:`sense_getter` creates a function for retrieving a :class:`wn.Sense` object given a sense key. 
Depending on the lexicon, it will build and use a mapping of sense key metadata to :class:`wn.Sense` objects, or it will escape the sense key and use the escaped form as the ``id`` argument for :meth:`wn.Wordnet.sense`. .. seealso:: The documentation from the Princeton WordNet: https://wordnet.princeton.edu/documentation/senseidx5wn """ from collections.abc import Callable from typing import TypeAlias import wn from wn._util import split_lexicon_specifier SensekeyGetter: TypeAlias = Callable[[wn.Sense], str | None] SenseGetter: TypeAlias = Callable[[str], wn.Sense | None] METADATA_LEXICONS = { # OMW 1.4 "omw-en:1.4", "omw-en31:1.4", # OMW 2.0 "omw-en15:2.0", "omw-en16:2.0", "omw-en17:2.0", "omw-en171:2.0", "omw-en20:2.0", "omw-en21:2.0", "omw-en:2.0", "omw-en31:2.0", # EWN (OEWN) 2019, 2020 "ewn:2019", "ewn:2020", } SENSE_ID_LEXICONS = { # specifier:flavor "oewn:2021": "oewn", "oewn:2022": "oewn", "oewn:2023": "oewn", "oewn:2024": "oewn", "oewn:2025": "oewn-v2", "oewn:2025+": "oewn-v2", } OEWN_LEMMA_UNESCAPE_SEQUENCES = [ ("-ap-", "'"), ("-ex-", "!"), ("-cm-", ","), ("-cn-", ":"), ("-pl-", "+"), ("-sl-", "/"), ] OEWN_V2_LEMMA_UNESCAPE_SEQUENCES = [ ("-apos-", "'"), ("-colon-", ":"), ("-excl-", "!"), ("-num-", "#"), ("-dollar-", "$"), ("-percnt-", "%"), ("-amp-", "&"), ("-lpar-", "("), ("-rpar-", ")"), ("-ast-", "*"), ("-plus-", "+"), ("-comma-", ","), ("-sol-", "/"), ("-lbrace-", "{"), ("-vert-", "|"), ("-rbrace-", "}"), ("-tilde-", "~"), ("-cent-", "¢"), ("-pound-", "£"), ("-sect-", "§"), ("-copy-", "©"), ("-reg-", "®"), ("-deg-", "°"), ("-acute-", "´"), # noqa: RUF001 ("-para-", "¶"), ("-ordm-", "º"), ("--", "-"), ] def unescape(s: str, /, flavor: str = "oewn-v2") -> str: """Return the original form of an escaped sense key. The *flavor* argument specifies how the unescaping will be done. 
Its default value is :python:`"oewn-v2"`, which unescapes like the Open English Wordnet 2025 editions, including separate rules for the left and right side of the ``__`` delimiter. The other possible value is ``"oewn"``, which unescapes like the Open English Wordnet 2024 and prior editions. >>> from wn.compat import sensekey >>> sensekey.unescape("ceramic__3.01.00..") 'ceramic%3:01:00::' Note that this function does not remove any lexicon ID prefixes on sense IDs, so that may need to be done manually: >>> sensekey.unescape("oewn-ceramic__3.01.00..") 'oewn-ceramic%3:01:00::' >>> sensekey.unescape("oewn-ceramic__3.01.00..".removeprefix("oewn-")) 'ceramic%3:01:00::' """ match flavor: case "oewn": return _unescape_oewn(s, OEWN_LEMMA_UNESCAPE_SEQUENCES) case "oewn-v2": return _unescape_oewn(s, OEWN_V2_LEMMA_UNESCAPE_SEQUENCES) case _: raise ValueError(f"unsupported flavor: {flavor}") def _unescape_oewn(s: str, escape_sequences: list[tuple[str, str]]) -> str: lemma, _, rest = s.partition("__") for esc, char in escape_sequences: lemma = lemma.replace(esc, char) rest = rest.replace(".", ":").replace("-sp-", "_") if rest: return f"{lemma}%{rest}" else: return lemma def escape(sense_key: str, /, flavor: str = "oewn-v2") -> str: """Return an escaped sense key that is valid for XML IDs. The *flavor* argument specifies how the escaping will be done. Its default value is :python:`"oewn-v2"`, which escapes like the Open English Wordnet 2025 editions, including separate rules for the left and right side of the ``%`` delimiter. The other possible value is ``"oewn"``, which escapes like the Open English Wordnet 2024 and prior editions. >>> from wn.compat import sensekey >>> sensekey.escape("ceramic%3:01:00::") 'ceramic__3.01.00..' 
""" match flavor: case "oewn": return _escape_oewn(sense_key, OEWN_LEMMA_UNESCAPE_SEQUENCES) case "oewn-v2": return _escape_oewn(sense_key, OEWN_V2_LEMMA_UNESCAPE_SEQUENCES) case _: raise ValueError(f"unsupported flavor: {flavor}") def _escape_oewn(sense_key: str, escape_sequences: list[tuple[str, str]]) -> str: lemma, _, rest = sense_key.partition("%") for esc, char in reversed(escape_sequences): lemma = lemma.replace(char, esc) rest = rest.replace(":", ".").replace("_", "-sp-") if rest: return f"{lemma}__{rest}" else: return lemma def sense_key_getter(lexicon: str) -> SensekeyGetter: """Return a function that gets sense keys from senses. The *lexicon* argument determines how the function will retrieve the sense key; i.e., whether it is from the ``identifier`` metadata or unescaping the sense ID. For any unsupported lexicon, an error is raised. The function that is returned accepts one argument, a :class:`wn.Sense` (ideally from the same lexicon specified in the *lexicon* argument), and returns a :class:`str` if the sense key exists in the lexicon or :data:`None` otherwise. >>> import wn >>> from wn.compat import sensekey >>> oewn = wn.Wordnet("oewn:2024") >>> get_sense_key = sensekey.sense_key_getter("oewn:2024") >>> get_sense_key(oewn.senses("alabaster")[0]) 'alabaster%3:01:00::' When unescaping a sense ID, if the ID starts with its lexicon's ID and a hyphen (e.g., `"oewn-"`), it is assumed to be a conventional ID prefix and is removed prior to unescaping. 
""" if lexicon in METADATA_LEXICONS: def getter(sense: wn.Sense) -> str | None: return sense.metadata().get("identifier") elif lexicon in SENSE_ID_LEXICONS: flavor = SENSE_ID_LEXICONS[lexicon] lexid, _ = split_lexicon_specifier(lexicon) prefix = f"{lexid}-" def getter(sense: wn.Sense) -> str | None: sense_key = sense.id.removeprefix(prefix) # check if sense id is likely an escaped sense key if "__" in sense_key: return unescape(sense_key, flavor=flavor) return None else: raise wn.Error(f"no sense key getter is defined for {lexicon}") return getter def sense_getter(lexicon: str, wordnet: wn.Wordnet | None = None) -> SenseGetter: """Return a function that gets the sense for a sense key. The *lexicon* argument determines how the function will retrieve the sense; i.e., whether a mapping between a sense's ``identifier`` metadata and the sense will be created and used or the escaped sense key is used as the sense ID. For any unsupported lexicon, an error is raised. The optional *wordnet* object is used as the source of the returned :class:`wn.Sense` objects. If none is provided, a new :class:`wn.Wordnet` object is created using the *lexicon* argument. The function that is returned accepts one argument, a :class:`str` of the sense key, and returns a :class:`wn.Sense` if the sense key exists in the lexicon or :data:`None` otherwise. >>> import wn >>> from wn.compat import sensekey >>> get_sense = sensekey.sense_getter("oewn:2024") >>> get_sense("alabaster%3:01:00::") Sense('oewn-alabaster__3.01.00..') .. warning:: The mapping built for the ``omw-en*`` or ``ewn`` lexicons requires significant memory---around 100MiB---to use. The ``oewn`` lexicons do not require such a mapping and the memory usage is negligible. 
""" if wordnet is None: wordnet = wn.Wordnet(lexicon) if lexicon in METADATA_LEXICONS: get_sense_key = sense_key_getter(lexicon) sense_key_map = {get_sense_key(s): s.id for s in wordnet.senses()} if None in sense_key_map: sense_key_map.pop(None) # senses without sense keys def getter(sense_key: str) -> wn.Sense | None: if sense_id := sense_key_map.get(sense_key): return wordnet.sense(sense_id) return None elif lexicon in SENSE_ID_LEXICONS: flavor = SENSE_ID_LEXICONS[lexicon] lexid, _ = split_lexicon_specifier(lexicon) def getter(sense_key: str) -> wn.Sense | None: sense_id = f"{lexid}-{escape(sense_key, flavor=flavor)}" try: return wordnet.sense(sense_id) except wn.Error: return None else: raise wn.Error(f"no sense getter is defined for {lexicon}") return getter wn-1.0.0/wn/constants.py000066400000000000000000000207721513755206300151650ustar00rootroot00000000000000""" Constants and literals used in wordnets. """ SENSE_RELATIONS = frozenset( [ "antonym", "also", "participle", "pertainym", "derivation", "domain_topic", "has_domain_topic", "domain_region", "has_domain_region", "exemplifies", "is_exemplified_by", "similar", "other", "feminine", "has_feminine", "masculine", "has_masculine", "young", "has_young", "diminutive", "has_diminutive", "augmentative", "has_augmentative", "anto_gradable", "anto_simple", "anto_converse", "simple_aspect_ip", "secondary_aspect_ip", "simple_aspect_pi", "secondary_aspect_pi", "metaphor", "has_metaphor", "metonym", "has_metonym", "agent", "body_part", "by_means_of", "destination", "event", "instrument", "location", "material", "property", "result", "state", "undergoer", "uses", "vehicle", ] ) SENSE_SYNSET_RELATIONS = frozenset( [ "other", "domain_topic", "domain_region", "exemplifies", ] ) SYNSET_RELATIONS = frozenset( [ "agent", "also", "attribute", "be_in_state", "causes", "classified_by", "classifies", "co_agent_instrument", "co_agent_patient", "co_agent_result", "co_instrument_agent", "co_instrument_patient", 
"co_instrument_result", "co_patient_agent", "co_patient_instrument", "co_result_agent", "co_result_instrument", "co_role", "direction", "domain_region", "domain_topic", "exemplifies", "entails", "eq_synonym", "has_domain_region", "has_domain_topic", "is_exemplified_by", "holo_location", "holo_member", "holo_part", "holo_portion", "holo_substance", "holonym", "hypernym", "hyponym", "in_manner", "instance_hypernym", "instance_hyponym", "instrument", "involved", "involved_agent", "involved_direction", "involved_instrument", "involved_location", "involved_patient", "involved_result", "involved_source_direction", "involved_target_direction", "is_caused_by", "is_entailed_by", "location", "manner_of", "mero_location", "mero_member", "mero_part", "mero_portion", "mero_substance", "meronym", "similar", "other", "patient", "restricted_by", "restricts", "result", "role", "source_direction", "state_of", "target_direction", "subevent", "is_subevent_of", "antonym", "feminine", "has_feminine", "masculine", "has_masculine", "young", "has_young", "diminutive", "has_diminutive", "augmentative", "has_augmentative", "anto_gradable", "anto_simple", "anto_converse", "ir_synonym", ] ) REVERSE_RELATIONS = { "hypernym": "hyponym", "hyponym": "hypernym", "instance_hypernym": "instance_hyponym", "instance_hyponym": "instance_hypernym", "antonym": "antonym", "eq_synonym": "eq_synonym", "similar": "similar", "meronym": "holonym", "holonym": "meronym", "mero_location": "holo_location", "holo_location": "mero_location", "mero_member": "holo_member", "holo_member": "mero_member", "mero_part": "holo_part", "holo_part": "mero_part", "mero_portion": "holo_portion", "holo_portion": "mero_portion", "mero_substance": "holo_substance", "holo_substance": "mero_substance", # 'also': '', "state_of": "be_in_state", "be_in_state": "state_of", "causes": "is_caused_by", "is_caused_by": "causes", "subevent": "is_subevent_of", "is_subevent_of": "subevent", "manner_of": "in_manner", "in_manner": "manner_of", 
"attribute": "attribute", "restricts": "restricted_by", "restricted_by": "restricts", "classifies": "classified_by", "classified_by": "classifies", "entails": "is_entailed_by", "is_entailed_by": "entails", "domain_topic": "has_domain_topic", "has_domain_topic": "domain_topic", "domain_region": "has_domain_region", "has_domain_region": "domain_region", "exemplifies": "is_exemplified_by", "is_exemplified_by": "exemplifies", "role": "involved", "involved": "role", "agent": "involved_agent", "involved_agent": "agent", "patient": "involved_patient", "involved_patient": "patient", "result": "involved_result", "involved_result": "result", "instrument": "involved_instrument", "involved_instrument": "instrument", "location": "involved_location", "involved_location": "location", "direction": "involved_direction", "involved_direction": "direction", "target_direction": "involved_target_direction", "involved_target_direction": "target_direction", "source_direction": "involved_source_direction", "involved_source_direction": "source_direction", "co_role": "co_role", "co_agent_patient": "co_patient_agent", "co_patient_agent": "co_agent_patient", "co_agent_instrument": "co_instrument_agent", "co_instrument_agent": "co_agent_instrument", "co_agent_result": "co_result_agent", "co_result_agent": "co_agent_result", "co_patient_instrument": "co_instrument_patient", "co_instrument_patient": "co_patient_instrument", "co_result_instrument": "co_instrument_result", "co_instrument_result": "co_result_instrument", # 'pertainym': '', "derivation": "derivation", "simple_aspect_ip": "simple_aspect_pi", "simple_aspect_pi": "simple_aspect_ip", "secondary_aspect_ip": "secondary_aspect_pi", "secondary_aspect_pi": "secondary_aspect_ip", "feminine": "has_feminine", "has_feminine": "feminine", "masculine": "has_masculine", "has_masculine": "masculine", "young": "has_young", "has_young": "young", "diminutive": "has_diminutive", "has_diminutive": "diminutive", "augmentative": "has_augmentative", 
"has_augmentative": "augmentative", "anto_gradable": "anto_gradable", "anto_simple": "anto_simple", "anto_converse": "anto_converse", "ir_synonym": "ir_synonym", # 'participle': '', # 'other': '', "metaphor": "has_metaphor", "metonym": "has_metonym", } # Adjective Positions ADJPOSITIONS = frozenset( ( "a", # attributive "ip", # immediate postnominal "p", # predicative ) ) # Parts of Speech NOUN = "n" #: VERB = "v" #: ADJ = ADJECTIVE = "a" #: ADV = ADVERB = "r" #: ADJ_SAT = ADJECTIVE_SATELLITE = "s" #: PHRASE = "t" #: CONJ = CONJUNCTION = "c" #: ADP = ADPOSITION = "p" #: OTHER = "x" #: UNKNOWN = "u" #: PARTS_OF_SPEECH = frozenset( ( NOUN, VERB, ADJECTIVE, ADVERB, ADJECTIVE_SATELLITE, PHRASE, CONJUNCTION, ADPOSITION, OTHER, UNKNOWN, ) ) # Lexicographer Files # from https://wordnet.princeton.edu/documentation/lexnames5wn LEXICOGRAPHER_FILES = { "adj.all": 0, "adj.pert": 1, "adv.all": 2, "noun.Tops": 3, "noun.act": 4, "noun.animal": 5, "noun.artifact": 6, "noun.attribute": 7, "noun.body": 8, "noun.cognition": 9, "noun.communication": 10, "noun.event": 11, "noun.feeling": 12, "noun.food": 13, "noun.group": 14, "noun.location": 15, "noun.motive": 16, "noun.object": 17, "noun.person": 18, "noun.phenomenon": 19, "noun.plant": 20, "noun.possession": 21, "noun.process": 22, "noun.quantity": 23, "noun.relation": 24, "noun.shape": 25, "noun.state": 26, "noun.substance": 27, "noun.time": 28, "verb.body": 29, "verb.change": 30, "verb.cognition": 31, "verb.communication": 32, "verb.competition": 33, "verb.consumption": 34, "verb.contact": 35, "verb.creation": 36, "verb.emotion": 37, "verb.motion": 38, "verb.perception": 39, "verb.possession": 40, "verb.social": 41, "verb.stative": 42, "verb.weather": 43, "adj.ppl": 44, } # resource types _WORDNET = "wordnet" _ILI = "ili" wn-1.0.0/wn/ic.py000066400000000000000000000147671513755206300135530ustar00rootroot00000000000000"""Information Content is a corpus-based metrics of synset or sense specificity. 
""" from collections import Counter from collections.abc import Callable, Iterable, Iterator from math import log from pathlib import Path from typing import TextIO, TypeAlias from wn import Synset, Wordnet from wn._types import AnyPath from wn.constants import ADJ, ADJ_SAT, ADV, NOUN, VERB from wn.util import synset_id_formatter # Just use a subset of all available parts of speech IC_PARTS_OF_SPEECH = frozenset((NOUN, VERB, ADJ, ADV)) Freq: TypeAlias = dict[str, dict[str | None, float]] def information_content(synset: Synset, freq: Freq) -> float: """Calculate the Information Content value for a synset. The information content of a synset is the negative log of the synset probability (see :func:`synset_probability`). """ return -log(synset_probability(synset, freq)) def synset_probability(synset: Synset, freq: Freq) -> float: """Calculate the synset probability. The synset probability is defined as freq(ss)/N where freq(ss) is the IC weight for the synset and N is the total IC weight for all synsets with the same part of speech. Note: this function is not generally used directly, but indirectly through :func:`information_content`. """ pos_freq = freq[synset.pos] return pos_freq[synset.id] / pos_freq[None] def _initialize( wordnet: Wordnet, smoothing: float, ) -> Freq: """Populate an Information Content weight mapping to a smoothing value. All synsets in *wordnet* are inserted into the dictionary and mapped to *smoothing*. """ freq: Freq = { pos: {synset.id: smoothing for synset in wordnet.synsets(pos=pos)} for pos in IC_PARTS_OF_SPEECH } # pretend ADJ_SAT is just ADJ for synset in wordnet.synsets(pos=ADJ_SAT): freq[ADJ][synset.id] = smoothing # also initialize totals (when synset is None) for each part-of-speech for pos in IC_PARTS_OF_SPEECH: freq[pos][None] = smoothing return freq def compute( corpus: Iterable[str], wordnet: Wordnet, distribute_weight: bool = True, smoothing: float = 1.0, ) -> Freq: """Compute Information Content weights from a corpus. 
Arguments: corpus: An iterable of string tokens. This is a flat list of words and the order does not matter. Tokens may be single words or multiple words separated by a space. wordnet: An instantiated :class:`wn.Wordnet` object, used to look up synsets from words. distribute_weight: If :python:`True`, the counts for a word are divided evenly among all synsets for the word. smoothing: The initial value given to each synset. Example: >>> import wn, wn.ic, wn.morphy >>> ewn = wn.Wordnet("ewn:2020", lemmatizer=wn.morphy.morphy) >>> freq = wn.ic.compute(["Dogs", "run", ".", "Cats", "sleep", "."], ewn) >>> dog = ewn.synsets("dog", pos="n")[0] >>> cat = ewn.synsets("cat", pos="n")[0] >>> frog = ewn.synsets("frog", pos="n")[0] >>> freq["n"][dog.id] 1.125 >>> freq["n"][cat.id] 1.1 >>> freq["n"][frog.id] # no occurrence; smoothing value only 1.0 >>> carnivore = dog.lowest_common_hypernyms(cat)[0] >>> freq["n"][carnivore.id] 1.3250000000000002 """ freq = _initialize(wordnet, smoothing) counts = Counter(corpus) hypernym_cache: dict[Synset, list[Synset]] = {} for word, count in counts.items(): synsets = wordnet.synsets(word) num = len(synsets) if num == 0: continue weight = float(count / num if distribute_weight else count) for synset in synsets: pos = synset.pos if pos == ADJ_SAT: pos = ADJ if pos not in IC_PARTS_OF_SPEECH: continue freq[pos][None] += weight # The following while-loop is equivalent to: # # freq[pos][synset.id] += weight # for path in synset.hypernym_paths(): # for ss in path: # freq[pos][ss.id] += weight # # ...but it caches hypernym lookups for speed agenda: list[tuple[Synset, set[Synset]]] = [(synset, set())] while agenda: ss, seen = agenda.pop() # avoid cycles if ss in seen: continue freq[pos][ss.id] += weight if ss not in hypernym_cache: hypernym_cache[ss] = ss.hypernyms() agenda.extend((hyp, seen | {ss}) for hyp in hypernym_cache[ss]) return freq def load( source: AnyPath, wordnet: Wordnet, get_synset_id: Callable | None = None, ) -> Freq: """Load an 
Information Content mapping from a file. Arguments: source: A path to an information content weights file. wordnet: A :class:`wn.Wordnet` instance with synset identifiers matching the offsets in the weights file. get_synset_id: A callable that takes a synset offset and part of speech and returns a synset ID valid in *wordnet*. Raises: :class:`wn.Error`: If *wordnet* does not have exactly one lexicon. Example: >>> import wn, wn.ic >>> pwn = wn.Wordnet("pwn:3.0") >>> path = "~/nltk_data/corpora/wordnet_ic/ic-brown-resnik-add1.dat" >>> freq = wn.ic.load(path, pwn) """ source = Path(source).expanduser().resolve(strict=True) assert len(wordnet.lexicons()) == 1 lexid = wordnet.lexicons()[0].id if get_synset_id is None: get_synset_id = synset_id_formatter(prefix=lexid) freq = _initialize(wordnet, 0.0) with source.open() as icfile: for offset, pos, weight, is_root in _parse_ic_file(icfile): ssid = get_synset_id(offset=offset, pos=pos) # synset = wordnet.synset(ssid) freq[pos][ssid] = weight if is_root: freq[pos][None] += weight return freq def _parse_ic_file(icfile: TextIO) -> Iterator[tuple[int, str, float, bool]]: """Parse the Information Content file. A sample of the format is:: wnver::eOS9lXC6GvMWznF1wkZofDdtbBU 1740n 1915712 ROOT 1930n 859272 2137n 1055337 """ next(icfile) # skip header for line in icfile: ssinfo, value, *isroot = line.split() yield (int(ssinfo[:-1]), ssinfo[-1], float(value), bool(isroot)) wn-1.0.0/wn/ili.py000066400000000000000000000227151513755206300137250ustar00rootroot00000000000000"""Interlingual Indices This module provides classes and functions for inspecting Interlingual Index (ILI) objects, both existing and proposed and including their definitions and any metadata, for synsets and lexicons. 
""" from __future__ import annotations from dataclasses import dataclass, field from enum import Enum from itertools import zip_longest from pathlib import Path from typing import TYPE_CHECKING, Literal, Protocol, overload from wn._lexicon import Lexicon, LexiconElementWithMetadata from wn._metadata import HasMetadata from wn._queries import ( find_ilis, find_proposed_ilis, get_ili, ) from wn._wordnet import Wordnet if TYPE_CHECKING: from collections.abc import Iterator from wn._core import Synset from wn._metadata import Metadata from wn._types import AnyPath class ILIStatus(str, Enum): __module__ = "wn" UNKNOWN = "unknown" # no information available ACTIVE = "active" # attested in ILI file and marked as active PRESUPPOSED = "presupposed" # used by lexicon, ILI file not loaded PROPOSED = "proposed" # proposed by lexicon for addition to ILI @dataclass(slots=True) class ILIDefinition(HasMetadata): """Class for modeling ILI definitions.""" __module__ = "wn" text: str _metadata: Metadata | None = field(default=None, compare=False, repr=False) _lexicon: str | None = field(default=None, compare=False, repr=False) def metadata(self) -> Metadata: """Return the ILI's metadata.""" return self._metadata if self._metadata is not None else {} def confidence(self) -> float: c = self.metadata().get("confidenceScore") if c is None: if self._lexicon: # ProposedILIs are lexicon elements and inherit their # lexicon's confidence value c = Lexicon.from_specifier(self._lexicon).confidence() else: # Regular ILIs are not lexicon elements c = 1.0 return float(c) class ILIProtocol(Protocol): _definition_text: str | None _definition_metadata: Metadata | None @property def id(self) -> str | None: """The ILI identifier.""" ... @property def status(self) -> ILIStatus: """The status of the ILI.""" ... @overload def definition(self, *, data: Literal[False] = False) -> str | None: ... @overload def definition(self, *, data: Literal[True] = True) -> ILIDefinition | None: ... 
# fallback for non-literal bool argument @overload def definition(self, *, data: bool) -> str | ILIDefinition | None: ... def definition(self, *, data: bool = False) -> str | ILIDefinition | None: """Return the ILI's definition. If the *data* argument is :python:`False` (the default), the definition is returned as a :class:`str` type. If it is :python:`True`, a :class:`wn.ILIDefinition` object is used instead. Note that :class:`ILI` objects will not have definitions unless an ILI resource has been added, but :class:`ProposedILI` objects will have definitions if one is provided by the proposing lexicon. """ if data and self._definition_text: return ILIDefinition( self._definition_text, _metadata=self._definition_metadata, # lexicon is defined only for proposed ILIs _lexicon=getattr(self, "_lexicon", None), ) return self._definition_text @dataclass(frozen=True, slots=True) class ILI(ILIProtocol): """A class for interlingual indices.""" __module__ = "wn" id: str status: ILIStatus = field( default=ILIStatus.UNKNOWN, repr=False, hash=False, compare=False ) _definition_text: str | None = field( default=None, repr=False, hash=False, compare=False ) _definition_metadata: Metadata | None = field( default=None, repr=False, hash=False, compare=False ) @dataclass(frozen=True, slots=True) class ProposedILI(LexiconElementWithMetadata, ILIProtocol): __module__ = "wn" _synset: str _lexicon: str _definition_text: str | None = field( default=None, repr=False, hash=False, compare=False ) _definition_metadata: Metadata | None = field( default=None, repr=False, hash=False, compare=False ) @property def id(self) -> Literal[None]: """Always return :python:`None`. Proposed ILIs do not have identifiers. This method is kept for interface consistency. """ return None @property def status(self) -> Literal[ILIStatus.PROPOSED]: """Always return :attr:`ILIStatus.PROPOSED`. Proposed ILI objects are only used for ILIs that are proposed. 
""" return ILIStatus.PROPOSED def synset(self) -> Synset: """Return the synset object associated with the proposed ILI.""" return Wordnet(self._lexicon).synset(self._synset) def get(id: str) -> ILI | None: """Get the ILI object with the given id. The *id* argument is a string ILI identifier. If *id* does not match a known ILI, :python:`None` is returned. Note that a :python:`None` value does not necessarily mean that there is no such ILI, but rather that no resource declaring that ILI has been loaded into Wn's database. Example: >>> from wn import ili >>> ili.get("i12345") ILI('i12345') >>> ili.get("i0") is None True """ if row := get_ili(id=id): id, status, defn, meta = row return ILI( id, status=ILIStatus(status), _definition_text=defn, _definition_metadata=meta, ) return None def get_all( *, status: ILIStatus | str | None = None, lexicon: str | None = None, ) -> list[ILI]: """Get the list of all matching ILI objects. The *status* argument may be a string matching a single :class:`ILIStatus`, or a union of one or more :class:`ILIStatus` values. The *lexicon* argument is a space-separated string of lexicon specifiers. All ILIs with a matching status and lexicon will be returned. Example: >>> from wn import ili >>> len(ili.get_all()) 117442 """ if isinstance(status, str): status = ILIStatus(status) lexicons = lexicon.split() if lexicon else [] return [ ILI( id, status=ILIStatus(status), _definition_text=defn, _definition_metadata=meta, ) for id, status, defn, meta in find_ilis(status=status, lexicons=lexicons) ] def get_proposed(synset: Synset) -> ProposedILI | None: """Get a proposed ILI for *synset* if it exists. The synset itself does not give a good indication if it has an associated proposed ILI. The :attr:`wn.Synset.ili` value will be :python:`None`, but this is also true if there is no ILI at all. In most cases it is easier to list the proposed ILIs for a lexicon using :func:`get_all_proposed`, then to retrieve their associated synsets. 
Example: >>> import wn >>> from wn import ili >>> en = wn.Wordnet("oewn:2024") >>> en.synset("oewn-00002935-r").ili is None True >>> ili.get_proposed(en.synset("oewn-00002935-r")) ProposedILI(_synset='oewn-00002935-r', _lexicon='oewn:2024') """ results = find_proposed_ilis( synset_id=synset.id, lexicons=(synset.lexicon().specifier(),), ) if row := next(results, None): return ProposedILI(*row) return None def get_all_proposed(lexicon: str | None = None) -> list[ProposedILI]: """Get the list of all proposed ILI objects. The *lexicon* argument is a space-separated string of lexicon specifiers. Proposed ILIs matching the lexicon will be returned. Example: >>> from wn import ili >>> proposed = ili.get_all_proposed("oewn:2024") >>> proposed[0] ProposedILI(_synset='oewn-00002935-r', _lexicon='oewn:2024') >>> proposed[0].synset() Synset('oewn-00002935-r') """ lexicons = lexicon.split() if lexicon else [] return [ProposedILI(*row) for row in find_proposed_ilis(lexicons=lexicons)] def is_ili_tsv(source: AnyPath) -> bool: """Return True if *source* is an ILI tab-separated-value file. This only checks that the first column, split by tabs, of the first line is 'ili' or 'ILI'. It does not check if each line has the correct number of columns. """ source = Path(source).expanduser() if source.is_file(): try: with source.open("rb") as fh: return next(fh).split(b"\t")[0] in (b"ili", b"ILI") except (StopIteration, IndexError): pass return False def load_tsv(source: AnyPath) -> Iterator[dict[str, str]]: """Yield data from an ILI tab-separated-value file. This function yields dictionaries mapping field names to values. The *source* argument is a path to an ILI file. 
Example: >>> from wn import ili >>> obj = next(ili._load_tsv("cili.tsv")) >>> obj.keys() dict_keys(['ili', 'definition']) >>> obj["ili"] 'i1' """ source = Path(source).expanduser() with source.open(encoding="utf-8") as fh: header = next(fh).rstrip("\r\n") fields = tuple(map(str.lower, header.split("\t"))) for line in fh: yield dict( zip_longest( fields, line.rstrip("\r\n").split("\t"), fillvalue="", ) ) wn-1.0.0/wn/index.toml000066400000000000000000000510361513755206300146000ustar00rootroot00000000000000[cili] type = "ili" label = "Collaborative Interlingual Index" license = "https://creativecommons.org/licenses/by/4.0/" [cili.versions."1.0"] url = "https://github.com/globalwordnet/cili/releases/download/v1.0/cili.tsv.xz" [oewn] label = "Open English WordNet" language = "en" license = "https://creativecommons.org/licenses/by/4.0/" [oewn.versions."2025+"] url = "https://en-word.net/static/english-wordnet-2025-plus.xml.gz" [oewn.versions.2025] url = "https://en-word.net/static/english-wordnet-2025.xml.gz" [oewn.versions.2024] url = """ https://en-word.net/static/english-wordnet-2024.xml.gz https://github.com/globalwordnet/english-wordnet/releases/download/2024-edition/english-wordnet-2024.xml.gz """ [oewn.versions.2023] url = """ https://en-word.net/static/english-wordnet-2023.xml.gz https://github.com/globalwordnet/english-wordnet/releases/download/2023-edition/english-wordnet-2023.xml.gz """ [oewn.versions.2022] url = """ https://en-word.net/static/english-wordnet-2022.xml.gz https://github.com/globalwordnet/english-wordnet/releases/download/2022-edition/english-wordnet-2022.xml.gz """ [oewn.versions.2021] url = "https://en-word.net/static/english-wordnet-2021.xml.gz" [oewn.versions.2020] error = "Use 'ewn' as the ID prior to version 2021 ('ewn:2020')" [oewn.versions.2019] error = "Use 'ewn' as the ID prior to version 2021 ('ewn:2019')" [ewn] label = "Open English WordNet" language = "en" license = "https://creativecommons.org/licenses/by/4.0/" [ewn.versions.2021] 
error = "Use 'oewn' as the ID from version 2021 ('oewn:2021')" [ewn.versions.2020] url = "https://en-word.net/static/english-wordnet-2020.xml.gz" [ewn.versions.2019] url = "https://en-word.net/static/english-wordnet-2019.xml.gz" [odenet] label = "Open German WordNet" language = "de" license = "https://creativecommons.org/licenses/by-sa/4.0/" [odenet.versions."1.4"] url = "https://github.com/hdaSprachtechnologie/odenet/releases/download/v1.4/odenet-1.4.tar.xz" [odenet.versions."1.3"] url = "https://github.com/hdaSprachtechnologie/odenet/releases/download/v1.3/odenet-1.3.tar.xz" [omw] label = "Open Multilingual Wordnet" language = "mul" license = "Please consult the LICENSE files included with the individual wordnets. Note that all permit redistribution." [omw.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-2.0.tar.xz" [omw.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-1.4.tar.xz" [omw.versions."1.3"] error = "OMW 1.3 is no longer indexed; See https://github.com/goodmami/wn#changes-to-the-index" [omw-en] label = "OMW English Wordnet based on WordNet 3.0" language = "en" license = "https://wordnet.princeton.edu/license-and-commercial-use" [omw-en.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-en-2.0.tar.xz" [omw-en.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-en-1.4.tar.xz" [omw-en15] label = "OMW English Wordnet based on WordNet-1.5" language = "en" license = "WordNet-1.5 License" [omw-en15.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-en15-2.0.tar.xz" [omw-en16] label = "OMW English Wordnet based on WordNet-1.6" language = "en" license = "WordNet-1.6 License" [omw-en16.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-en16-2.0.tar.xz" [omw-en17] label = "OMW English Wordnet based on WordNet-1.7" language = "en" license = 
"https://wordnetcode.princeton.edu/1.7/LICENSE" [omw-en17.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-en17-2.0.tar.xz" [omw-en171] label = "OMW English Wordnet based on WordNet-1.7.1" language = "en" license = "https://wordnetcode.princeton.edu/1.7.1/LICENSE" [omw-en171.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-en171-2.0.tar.xz" [omw-en20] label = "OMW English Wordnet based on WordNet-2.0" language = "en" license = "https://wordnetcode.princeton.edu/2.0/LICENSE" [omw-en20.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-en20-2.0.tar.xz" [omw-en21] label = "OMW English Wordnet based on WordNet-2.1" language = "en" license = "https://wordnetcode.princeton.edu/2.1/LICENSE" [omw-en21.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-en21-2.0.tar.xz" [omw-en31] label = "OMW English Wordnet based on WordNet 3.1" language = "en" license = "https://wordnet.princeton.edu/license-and-commercial-use" [omw-en31.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-en31-2.0.tar.xz" [omw-en31.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-en31-1.4.tar.xz" [omw-arb] label = "Arabic WordNet (AWN v2)" language = "arb" license = "https://creativecommons.org/licenses/by-sa/3.0/" [omw-arb.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-arb-2.0.tar.xz" [omw-arb.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-arb-1.4.tar.xz" [omw-bg] label = "BulTreeBank Wordnet (BTB-WN)" language = "bg" license = "https://creativecommons.org/licenses/by/3.0/" [omw-bg.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-bg-2.0.tar.xz" [omw-bg.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-bg-1.4.tar.xz" [omw-ca] label = "Multilingual Central Repository 
(Catalan)" language = "ca" license = "https://creativecommons.org/licenses/by/3.0/" [omw-ca.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-ca-2.0.tar.xz" [omw-ca.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-ca-1.4.tar.xz" [omw-cmn] label = "Chinese Open Wordnet" language = "cmn-Hans" license = "wordnet" [omw-cmn.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-cmn-2.0.tar.xz" [omw-cmn.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-cmn-1.4.tar.xz" [omw-da] label = "DanNet" language = "da" license = "wordnet" [omw-da.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-da-2.0.tar.xz" [omw-da.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-da-1.4.tar.xz" [omw-el] label = "Greek Wordnet" language = "el" license = "https://opensource.org/licenses/Apache-2.0" [omw-el.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-el-2.0.tar.xz" [omw-el.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-el-1.4.tar.xz" [omw-es] label = "Multilingual Central Repository (Spanish)" language = "es" license = "https://creativecommons.org/licenses/by/3.0/" [omw-es.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-es-2.0.tar.xz" [omw-es.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-es-1.4.tar.xz" [omw-eu] label = "Multilingual Central Repository (Basque)" language = "eu" license = "https://creativecommons.org/licenses/by/3.0/" [omw-eu.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-eu-2.0.tar.xz" [omw-eu.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-eu-1.4.tar.xz" [omw-fi] label = "FinnWordNet" language = "fi" license = "https://creativecommons.org/licenses/by/3.0/" 
[omw-fi.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-fi-2.0.tar.xz" [omw-fi.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-fi-1.4.tar.xz" [omw-fr] label = "WOLF (Wordnet Libre du Français)" language = "fr" license = "http://www.cecill.info/licenses/Licence_CeCILL-C_V1-en.html" [omw-fr.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-fr-2.0.tar.xz" [omw-fr.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-fr-1.4.tar.xz" [omw-gl] label = "Multilingual Central Repository (Galician)" language = "gl" license = "https://creativecommons.org/licenses/by/3.0/" [omw-gl.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-gl-2.0.tar.xz" [omw-gl.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-gl-1.4.tar.xz" [omw-he] label = "Hebrew Wordnet" language = "he" license = "wordnet" [omw-he.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-he-2.0.tar.xz" [omw-he.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-he-1.4.tar.xz" [omw-hr] label = "Croatian Wordnet" language = "hr" license = "https://creativecommons.org/licenses/by/3.0/" [omw-hr.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-hr-2.0.tar.xz" [omw-hr.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-hr-1.4.tar.xz" [omw-id] label = "Wordnet Bahasa (Indonesian)" language = "id" license = "https://opensource.org/licenses/MIT/" [omw-id.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-id-2.0.tar.xz" [omw-id.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-id-1.4.tar.xz" [omw-is] label = "IceWordNet" language = "is" license = "https://creativecommons.org/licenses/by/3.0/" [omw-is.versions."2.0"] url = 
"https://github.com/omwn/omw-data/releases/download/v2.0/omw-is-2.0.tar.xz" [omw-is.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-is-1.4.tar.xz" [omw-it] label = "MultiWordNet (Italian)" language = "it" license = "https://creativecommons.org/licenses/by/3.0/" [omw-it.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-it-2.0.tar.xz" [omw-it.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-it-1.4.tar.xz" [omw-iwn] label = "ItalWordNet" language = "it" license = "http://opendefinition.org/licenses/odc-by/" [omw-iwn.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-iwn-2.0.tar.xz" [omw-iwn.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-iwn-1.4.tar.xz" [omw-ja] label = "Japanese Wordnet" language = "ja" license = "wordnet" [omw-ja.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-ja-2.0.tar.xz" [omw-ja.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-ja-1.4.tar.xz" [omw-lt] label = "Lithuanian WordNet" language = "lt" license = "https://creativecommons.org/licenses/by-sa/3.0/" [omw-lt.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-lt-2.0.tar.xz" [omw-lt.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-lt-1.4.tar.xz" [omw-nb] label = "Norwegian Wordnet (Bokmål)" language = "nb" license = "wordnet" [omw-nb.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-nb-2.0.tar.xz" [omw-nb.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-nb-1.4.tar.xz" [omw-nl] label = "Open Dutch WordNet" language = "nl" license = "https://creativecommons.org/licenses/by-sa/4.0/" [omw-nl.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-nl-2.0.tar.xz" [omw-nl.versions."1.4"] url = 
"https://github.com/omwn/omw-data/releases/download/v1.4/omw-nl-1.4.tar.xz" [omw-nn] label = "Norwegian Wordnet (Nynorsk)" language = "nn" license = "wordnet" [omw-nn.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-nn-2.0.tar.xz" [omw-nn.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-nn-1.4.tar.xz" [omw-pl] label = "plWordNet" language = "pl" license = "wordnet" [omw-pl.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-pl-2.0.tar.xz" [omw-pl.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-pl-1.4.tar.xz" [omw-pt] label = "OpenWN-PT" language = "pt" license = "https://creativecommons.org/licenses/by-sa/" [omw-pt.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-pt-2.0.tar.xz" [omw-pt.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-pt-1.4.tar.xz" [omw-ro] label = "Romanian Wordnet" language = "ro" license = "https://creativecommons.org/licenses/by-sa/" [omw-ro.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-ro-2.0.tar.xz" [omw-ro.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-ro-1.4.tar.xz" [omw-sk] label = "Slovak WordNet" language = "sk" license = "https://creativecommons.org/licenses/by-sa/3.0/" [omw-sk.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-sk-2.0.tar.xz" [omw-sk.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-sk-1.4.tar.xz" [omw-sl] label = "sloWNet" language = "sl" license = "https://creativecommons.org/licenses/by-sa/3.0/" [omw-sl.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-sl-2.0.tar.xz" [omw-sl.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-sl-1.4.tar.xz" [omw-sq] label = "Albanet" language = "sq" license = 
"https://creativecommons.org/licenses/by/3.0/" [omw-sq.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-sq-2.0.tar.xz" [omw-sq.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-sq-1.4.tar.xz" [omw-sv] label = "WordNet-SALDO" language = "sv" license = "https://creativecommons.org/licenses/by/3.0/" [omw-sv.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-sv-2.0.tar.xz" [omw-sv.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-sv-1.4.tar.xz" [omw-th] label = "Thai Wordnet" language = "th" license = "wordnet" [omw-th.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-th-2.0.tar.xz" [omw-th.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-th-1.4.tar.xz" [omw-zsm] label = "Wordnet Bahasa (Malaysian)" language = "zsm" license = "https://opensource.org/licenses/MIT/" [omw-zsm.versions."2.0"] url = "https://github.com/omwn/omw-data/releases/download/v2.0/omw-zsm-2.0.tar.xz" [omw-zsm.versions."1.4"] url = "https://github.com/omwn/omw-data/releases/download/v1.4/omw-zsm-1.4.tar.xz" [own] label = "Open Wordnets for Portuguese and English" language = "mul" license = "Please consult the LICENSE files." [own.versions."1.0.0"] url = "https://github.com/own-pt/openWordnet-PT/releases/download/v1.0.0/own.tar.gz" [own-en] label = "Open Wordnet for English" language = "en" license = "Please consult the LICENSE files." [own-en.versions."1.0.0"] url = "https://github.com/own-pt/openWordnet-PT/releases/download/v1.0.0/own-en.tar.gz" [own-pt] label = "Open Wordnet for Portuguese" language = "pt" license = "Please consult the LICENSE files." 
[own-pt.versions."1.0.0"] url = "https://github.com/own-pt/openWordnet-PT/releases/download/v1.0.0/own-pt.tar.gz" [kurdnet] label = "KurdNet (Kurdish WordNet)" language = "ckb" license = "https://creativecommons.org/licenses/by-sa/4.0/" [kurdnet.versions."1.0"] url = "https://github.com/sinaahmadi/kurdnet/releases/download/kurdnet-1.0.tar.xz/kurdnet-1.0.tar.xz" # Delisted wordnets [pwn] [pwn.versions."3.0"] error = "'pwn:3.0' is no longer indexed; use 'omw-en:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [pwn.versions."3.1"] error = "'pwn:3.1' is no longer indexed; use 'omw-en31:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [alswn] error = "'alswn:1.3+omw' is no longer indexed; use 'omw-sq:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [arbwn] error = "'arbwn:1.3+omw' is no longer indexed; use 'omw-arb:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [bulwn] error = "'bulwn:1.3+omw' is no longer indexed; use 'omw-bg:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [catwn] error = "'catwn:1.3+omw' is no longer indexed; use 'omw-ca:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [cmnwn] error = "'cmnwn:1.3+omw' is no longer indexed; use 'omw-cmn:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [danwn] error = "'danwn:1.3+omw' is no longer indexed; use 'omw-da:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [ellwn] error = "'ellwn:1.3+omw' is no longer indexed; use 'omw-el:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [euswn] error = "'euswn:1.3+omw' is no longer indexed; use 'omw-eu:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [finwn] error = "'finwn:1.3+omw' is no longer indexed; use 'omw-fi:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [frawn] error = "'frawn:1.3+omw' is no longer indexed; use 'omw-fr:1.4' instead 
(https://github.com/goodmami/wn#changes-to-the-index)" [glgwn] error = "'glgwn:1.3+omw' is no longer indexed; use 'omw-gl:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [hebwn] error = "'hebwn:1.3+omw' is no longer indexed; use 'omw-he:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [hrvwn] error = "'hrvwn:1.3+omw' is no longer indexed; use 'omw-hr:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [indwn] error = "'indwn:1.3+omw' is no longer indexed; use 'omw-id:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [islwn] error = "'islwn:1.3+omw' is no longer indexed; use 'omw-is:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [itawn] error = "'itawn:1.3+omw' is no longer indexed; use 'omw-it:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [iwn] error = "'iwn:1.3+omw' is no longer indexed; use 'omw-iwn:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [jpnwn] error = "'jpnwn:1.3+omw' is no longer indexed; use 'omw-ja:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [litwn] error = "'litwn:1.3+omw' is no longer indexed; use 'omw-lt:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [nldwn] error = "'nldwn:1.3+omw' is no longer indexed; use 'omw-nl:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [nnown] error = "'nnown:1.3+omw' is no longer indexed; use 'omw-nn:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [nobwn] error = "'nobwn:1.3+omw' is no longer indexed; use 'omw-nb:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [polwn] error = "'polwn:1.3+omw' is no longer indexed; use 'omw-pl:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [porwn] error = "'porwn:1.3+omw' is no longer indexed; use 'omw-pt:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [ronwn] error = "'ronwn:1.3+omw' is no longer indexed; 
use 'omw-ro:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [slkwn] error = "'slkwn:1.3+omw' is no longer indexed; use 'omw-sk:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [slvwn] error = "'slvwn:1.3+omw' is no longer indexed; use 'omw-sl:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [spawn] error = "'spawn:1.3+omw' is no longer indexed; use 'omw-es:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [swewn] error = "'swewn:1.3+omw' is no longer indexed; use 'omw-sv:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [thawn] error = "'thawn:1.3+omw' is no longer indexed; use 'omw-th:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" [zsmwn] error = "'zsmwn:1.3+omw' is no longer indexed; use 'omw-zsm:1.4' instead (https://github.com/goodmami/wn#changes-to-the-index)" wn-1.0.0/wn/lmf.py000066400000000000000000000752141513755206300137300ustar00rootroot00000000000000""" Reader for the Lexical Markup Framework (LMF) format. 
""" import re import xml.etree.ElementTree as ET # for general XML parsing import xml.parsers.expat # for fast scanning of Lexicon versions from pathlib import Path from typing import Any, BinaryIO, Literal, TextIO, TypedDict, cast from xml.sax.saxutils import quoteattr from wn._exceptions import Error from wn._metadata import Metadata from wn._types import AnyPath, VersionInfo from wn._util import is_xml, version_info from wn.util import ProgressBar, ProgressHandler class LMFError(Error): """Raised on invalid LMF-XML documents.""" class LMFWarning(Warning): """Issued on non-conforming LFM values.""" SUPPORTED_VERSIONS = {"1.0", "1.1", "1.2", "1.3", "1.4"} _XMLDECL = b'' _XMLSPACEATTR = "http://www.w3.org/XML/1998/namespace space" # xml:space _DOCTYPE = '' _SCHEMAS = { "1.0": "https://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd", "1.1": "https://globalwordnet.github.io/schemas/WN-LMF-1.1.dtd", "1.2": "https://globalwordnet.github.io/schemas/WN-LMF-1.2.dtd", "1.3": "https://globalwordnet.github.io/schemas/WN-LMF-1.3.dtd", "1.4": "https://globalwordnet.github.io/schemas/WN-LMF-1.4.dtd", } _DOCTYPES = { _DOCTYPE.format(schema=schema): version for version, schema in _SCHEMAS.items() } _DOCTYPES.update( (_DOCTYPE.format(schema=schema.replace("https://", "http://")), version) for version, schema in _SCHEMAS.items() ) _DC_URIS = { "1.0": "http://purl.org/dc/elements/1.1/", "1.1": "https://globalwordnet.github.io/schemas/dc/", "1.2": "https://globalwordnet.github.io/schemas/dc/", "1.3": "https://globalwordnet.github.io/schemas/dc/", "1.4": "https://globalwordnet.github.io/schemas/dc/", } _DC_ATTRS = [ "contributor", "coverage", "creator", "date", "description", "format", "identifier", "publisher", "relation", "rights", "source", "subject", "title", "type", ] _NS_ATTRS = { version: dict( [(f"{uri} {attr}", attr) for attr in _DC_ATTRS] + [ ("status", "status"), ("note", "note"), ("confidenceScore", "confidenceScore"), ] ) for version, uri in _DC_URIS.items() } 
_LMF_1_0_ELEMS: dict[str, str] = { "LexicalResource": "lexical-resource", "Lexicon": "lexicons", "LexicalEntry": "entries", "Lemma": "lemma", "Form": "forms", "Tag": "tags", "Sense": "senses", "SenseRelation": "relations", "Example": "examples", "Count": "counts", "SyntacticBehaviour": "frames", "Synset": "synsets", "Definition": "definitions", "ILIDefinition": "ili_definition", "SynsetRelation": "relations", } _LMF_1_1_ELEMS = dict(_LMF_1_0_ELEMS) _LMF_1_1_ELEMS.update( { "Requires": "requires", "Extends": "extends", "Pronunciation": "pronunciations", "LexiconExtension": "lexicons", "ExternalLexicalEntry": "entries", "ExternalLemma": "lemma", "ExternalForm": "forms", "ExternalSense": "senses", "ExternalSynset": "synsets", } ) _VALID_ELEMS = { "1.0": _LMF_1_0_ELEMS, "1.1": _LMF_1_1_ELEMS, "1.2": _LMF_1_1_ELEMS, # no new elements "1.3": _LMF_1_1_ELEMS, # no new elements "1.4": _LMF_1_1_ELEMS, # no new elements } _LIST_ELEMS = { # elements that collect into lists "Lexicon", "LexicalEntry", "Form", "Pronunciation", "Tag", "Sense", "SenseRelation", "Example", "Count", "Synset", "Definition", "SynsetRelation", "SyntacticBehaviour", "LexiconExtension", "Requires", "ExternalLexicalEntry", "ExternalForm", "ExternalSense", "ExternalSynset", } _CDATA_ELEMS = { # elements with inner text "Pronunciation", "Tag", "Definition", "ILIDefinition", "Example", "Count", } _META_ELEMS = { # elements with metadata "Lexicon", "LexicalEntry", "Sense", "SenseRelation", "Example", "Count", "Synset", "Definition", "ILIDefinition", "SynsetRelation", "LexiconExtension", } # WN-LMF Modeling ###################################################### # WN-LMF type-checking is handled via TypedDicts. Inheritance and # `total=False` are used to model optionality. For more information # about this tactic, see https://www.python.org/dev/peps/pep-0589/. # From Python 3.11, we can use typing.Required / typing.NotRequired. 
class _HasId(TypedDict): id: str class _HasILI(TypedDict): ili: str class _HasSynset(TypedDict): synset: str class _MaybeId(TypedDict, total=False): id: str class _HasText(TypedDict): text: str class _MaybeScript(TypedDict, total=False): script: str class _HasMeta(TypedDict, total=False): meta: Metadata | None class _External(TypedDict): external: Literal[True] class ILIDefinition(_HasText, _HasMeta): ... class Definition(_HasText, _HasMeta, total=False): language: str sourceSense: str class Relation(_HasMeta): target: str relType: str class Example(_HasText, _HasMeta, total=False): language: str class Synset(_HasId, _HasILI, _HasMeta, total=False): ili_definition: ILIDefinition partOfSpeech: str definitions: list[Definition] relations: list[Relation] examples: list[Example] lexicalized: bool members: list[str] lexfile: str class ExternalSynset(_HasId, _External, total=False): definitions: list[Definition] relations: list[Relation] examples: list[Example] class Count(_HasMeta): value: int class Sense(_HasId, _HasSynset, _HasMeta, total=False): relations: list[Relation] examples: list[Example] counts: list[Count] n: int lexicalized: bool adjposition: str subcat: list[str] class ExternalSense(_HasId, _External, total=False): relations: list[Relation] examples: list[Example] counts: list[Count] class Pronunciation(_HasText, total=False): variety: str notation: str phonemic: bool audio: str class Tag(_HasText): category: str class _FormChildren(TypedDict, total=False): pronunciations: list[Pronunciation] tags: list[Tag] class Lemma(_MaybeScript, _FormChildren): writtenForm: str partOfSpeech: str class ExternalLemma(_FormChildren, _External): ... class Form(_MaybeId, _MaybeScript, _FormChildren): writtenForm: str class ExternalForm(_HasId, _FormChildren, _External): ... 
class _SyntacticBehaviourBase(_MaybeId): subcategorizationFrame: str class SyntacticBehaviour(_SyntacticBehaviourBase, total=False): senses: list[str] class _LexicalEntryBase(_HasId, _HasMeta, total=False): index: str forms: list[Form] senses: list[Sense] frames: list[SyntacticBehaviour] class LexicalEntry(_LexicalEntryBase): lemma: Lemma class ExternalLexicalEntry(_HasId, _External, total=False): lemma: ExternalLemma | None forms: list[Form | ExternalForm] senses: list[Sense | ExternalSense] class LexiconSpecifier(_HasId): # public but not an LMF entry version: str class Dependency(LexiconSpecifier, total=False): url: str | None class _LexiconRequired(LexiconSpecifier, _HasMeta): label: str language: str email: str license: str class _LexiconBase(_LexiconRequired, total=False): url: str citation: str logo: str class Lexicon(_LexiconBase, total=False): requires: list[Dependency] entries: list[LexicalEntry] synsets: list[Synset] frames: list[SyntacticBehaviour] class _LexiconExtensionBase(_LexiconBase): extends: Dependency class LexiconExtension(_LexiconExtensionBase, total=False): requires: list[Dependency] entries: list[LexicalEntry | ExternalLexicalEntry] synsets: list[Synset | ExternalSynset] frames: list[SyntacticBehaviour] class LexicalResource(TypedDict): lmf_version: str lexicons: list[Lexicon | LexiconExtension] # Reading ############################################################## def is_lmf(source: AnyPath) -> bool: """Return True if *source* is a WN-LMF file.""" source = Path(source).expanduser() if not is_xml(source): return False with source.open(mode="rb") as fh: try: _read_header(fh) except LMFError: return False return True def _read_header(fh: BinaryIO) -> str: xmldecl = fh.readline().rstrip().replace(b"'", b'"') doctype = fh.readline().rstrip().replace(b"'", b'"') if xmldecl != _XMLDECL: raise LMFError("invalid or missing XML declaration") # the XML declaration states that the file is UTF-8 (other # encodings are not allowed) doctype_decoded = 
doctype.decode("utf-8") if doctype_decoded not in _DOCTYPES: raise LMFError("invalid or missing DOCTYPE declaration") return _DOCTYPES[doctype_decoded] class ScanInfo(LexiconSpecifier): label: str | None extends: LexiconSpecifier | None def scan_lexicons(source: AnyPath) -> list[ScanInfo]: """Scan *source* and return only the top-level lexicon info. The returned info is a dictionary containing the `id`, `version`, and `label` attributes from a lexicon. If the Lexicon is an extension, an `extends` key maps to a dictionary with the `id` and `version` of the base lexicon, otherwise it maps to :python:`None`. """ source = Path(source).expanduser() infos: list[ScanInfo] = [] lex_re = re.compile(b"<(Lexicon|LexiconExtension|Extends)\\b([^>]*)>", flags=re.M) attr_re = re.compile(b"""\\b(id|ref|version|label)=["']([^"']+)["']""", flags=re.M) with open(source, "rb") as fh: for m in lex_re.finditer(fh.read()): lextype, remainder = m.groups() attrs = { _m.group(1).decode("utf-8"): _m.group(2).decode("utf-8") for _m in attr_re.finditer(remainder) } info: ScanInfo = { "id": attrs.get("id", attrs.get("ref", "(unknown id)")), "version": attrs.get("version", "(unknown version)"), "label": attrs.get("label"), "extends": None, } if info["id"] is None or info["version"] is None: raise LMFError(f"<{lextype.decode('utf-8')}> missing id or version") if lextype != b"Extends": infos.append(info) elif len(infos) > 0: infos[-1]["extends"] = {"id": info["id"], "version": info["version"]} else: raise LMFError("invalid use of in WN-LMF file") return infos _Elem = dict[str, Any] # basic type for the loaded XML data def load( source: AnyPath, progress_handler: type[ProgressHandler] | None = ProgressBar ) -> LexicalResource: """Load wordnets encoded in the WN-LMF format. 
Args: source: path to a WN-LMF file """ source = Path(source).expanduser() if progress_handler is None: progress_handler = ProgressHandler version, num_elements = _quick_scan(source) progress = progress_handler( message="Read", total=num_elements, refresh_interval=10000 ) root: dict[str, _Elem] = {} parser = _make_parser(root, version, progress) with open(source, "rb") as fh: try: parser.ParseFile(fh) except xml.parsers.expat.ExpatError as exc: raise LMFError("invalid or ill-formed XML") from exc progress.close() resource: LexicalResource = { "lmf_version": version, "lexicons": [ _validate(lex) for lex in root["lexical-resource"].get("lexicons", []) ], } return resource def _quick_scan(source: Path) -> tuple[str, int]: with source.open("rb") as fh: version = _read_header(fh) # _read_header() only reads the first 2 lines remainder = fh.read() num_elements = remainder.count(b"") return version, num_elements def _make_parser(root, version, progress): # noqa: C901 stack = [root] ELEMS = _VALID_ELEMS[version] NS_ATTRS = _NS_ATTRS[version] CDATA_ELEMS = _CDATA_ELEMS & set(ELEMS) LIST_ELEMS = _LIST_ELEMS & set(ELEMS) p = xml.parsers.expat.ParserCreate(namespace_separator=" ") def start(name, attrs): if name in _META_ELEMS: meta = {} for attr in list(attrs): if attr in NS_ATTRS: meta[NS_ATTRS[attr]] = attrs.pop(attr) attrs["meta"] = meta or None if name in CDATA_ELEMS: attrs["text"] = "" if name.startswith("External"): attrs["external"] = True parent = stack[-1] key = ELEMS.get(name) if name in LIST_ELEMS: parent.setdefault(key, []).append(attrs) elif key is None or key in parent: raise _unexpected(name, p) else: parent[key] = attrs stack.append(attrs) def char_data(data): parent = stack[-1] if "text" in parent: # sometimes the buffering occurs in the middle of text, so # append the current data, don't just assign it parent["text"] += data def end(name): elem = stack.pop() # normalize whitespace unless xml:space=preserve if "text" in elem and elem.get(_XMLSPACEATTR, "") != 
"preserve": elem["text"] = " ".join(elem["text"].split()) progress.update(force=(name == "LexicalResource")) p.StartElementHandler = start p.EndElementHandler = end p.CharacterDataHandler = char_data return p def _unexpected(name: str, p: xml.parsers.expat.XMLParserType) -> LMFError: return LMFError(f"unexpected element at line {p.CurrentLineNumber}: {name}") # Validation ########################################################### def _validate(elem: _Elem) -> Lexicon | LexiconExtension: ext = elem.get("extends") if ext: if "ref" in ext: ext["id"] = ext.pop("ref") # normalize ref to id internally assert "id" in ext assert "version" in ext _validate_lexicon(elem, True) return cast("LexiconExtension", elem) else: _validate_lexicon(elem, False) return cast("Lexicon", elem) def _validate_lexicon(elem: _Elem, extension: bool) -> None: for attr in "id", "version", "label", "language", "email", "license": assert attr in elem, f" missing required attribute: {attr}" for dep in elem.get("requires", []): if "ref" in dep: dep["id"] = dep.pop("ref") # normalize ref to id internally assert "id" in dep assert "version" in dep _validate_entries(elem.get("entries", []), extension) _validate_synsets(elem.get("synsets", []), extension) _validate_frames(elem.get("frames", [])) def _validate_entries(elems: list[_Elem], extension: bool) -> None: for elem in elems: assert "id" in elem if not extension: assert not elem.get("external") lemma = elem.get("lemma") if not elem.get("external"): assert lemma is not None elem.setdefault("meta") # lemma and forms are the same except for partOfSpeech and id if lemma is not None and not lemma.get("external"): assert "partOfSpeech" in lemma for form in elem.get("forms", []): assert not form.get("external") or form.get("id") _validate_forms(([lemma] if lemma else []) + elem.get("forms", []), extension) _validate_senses(elem.get("senses", []), extension) _validate_frames(elem.get("frames", [])) def _validate_forms(elems: list[_Elem], extension: bool) 
-> None: for elem in elems: if not extension: assert not elem.get("external") if not elem.get("external"): assert "writtenForm" in elem for pron in elem.get("pronunciations", []): pron.setdefault("text", "") if pron.get("phonemic"): pron["phonemic"] = pron["phonemic"] != "false" for tag in elem.get("tags", []): tag.setdefault("text", "") assert "category" in tag def _validate_senses(elems: list[_Elem], extension: bool) -> None: for elem in elems: assert "id" in elem if not extension: assert not elem.get("external") if not elem.get("external"): assert "synset" in elem elem.setdefault("meta") for rel in elem.get("relations", []): assert "target" in rel assert "relType" in rel rel.setdefault("meta") for ex in elem.get("examples", []): ex.setdefault("text", "") ex.setdefault("meta") for cnt in elem.get("counts", []): assert "text" in cnt cnt["value"] = int(cnt.pop("text")) cnt.setdefault("meta") if elem.get("lexicalized"): elem["lexicalized"] = elem["lexicalized"] != "false" if elem.get("subcat"): elem["subcat"] = elem["subcat"].split() if elem.get("n"): elem["n"] = int(elem["n"]) def _validate_frames(elems: list[_Elem]) -> None: for elem in elems: assert "subcategorizationFrame" in elem if elem.get("senses"): elem["senses"] = elem["senses"].split() def _validate_synsets(elems: list[_Elem], extension: bool) -> None: for elem in elems: assert "id" in elem if not extension: assert not elem.get("external") if not elem.get("external"): assert "ili" in elem elem.setdefault("meta") for defn in elem.get("definitions", []): defn.setdefault("text", "") defn.setdefault("meta") for rel in elem.get("relations", []): assert "target" in rel assert "relType" in rel rel.setdefault("meta") for ex in elem.get("examples", []): ex.setdefault("text", "") ex.setdefault("meta") if elem.get("lexicalized"): elem["lexicalized"] = elem["lexicalized"] != "false" if elem.get("members"): elem["members"] = elem["members"].split() def _validate_metadata(elem: _Elem) -> None: if 
elem.get("confidenceScore"): elem["confidenceScore"] = float(elem["confidenceScore"]) # Serialization ######################################################## def dump(resource: LexicalResource, destination: AnyPath) -> None: """Write wordnets in the WN-LMF format. Args: lexicons: a list of :class:`Lexicon` objects """ version = resource["lmf_version"] if version not in SUPPORTED_VERSIONS: raise LMFError(f"invalid version: {version}") destination = Path(destination).expanduser() doctype = _DOCTYPE.format(schema=_SCHEMAS[version]) dc_uri = _DC_URIS[version] _version = version_info(version) with destination.open("wt", encoding="utf-8") as out: print(_XMLDECL.decode("utf-8"), file=out) print(doctype, file=out) print(f'', file=out) for lexicon in resource["lexicons"]: _dump_lexicon(lexicon, out, _version) print("", file=out) def _dump_lexicon( lexicon: Lexicon | LexiconExtension, out: TextIO, version: VersionInfo ) -> None: lexicontype = "LexiconExtension" if lexicon.get("extends") else "Lexicon" attrib = _build_lexicon_attrib(lexicon, version) attrdelim = "\n" + (" " * len(f" <{lexicontype} ")) attrs = attrdelim.join( f"{attr}={quoteattr(str(val))}" for attr, val in attrib.items() ) print(f" <{lexicontype} {attrs}>", file=out) if version >= (1, 1): if lexicontype == "LexiconExtension": assert lexicon.get("extends") lexicon = cast("LexiconExtension", lexicon) _dump_dependency(lexicon["extends"], "Extends", out, version) for req in lexicon.get("requires", []): _dump_dependency(req, "Requires", out, version) for entry in lexicon.get("entries", []): _dump_lexical_entry(entry, out, version) for synset in lexicon.get("synsets", []): _dump_synset(synset, out, version) if version >= (1, 1): for sb in lexicon.get("frames", []): _dump_syntactic_behaviour(sb, out, version) print(f" ", file=out) def _build_lexicon_attrib( lexicon: Lexicon | LexiconExtension, version: VersionInfo ) -> dict[str, str]: attrib = { "id": lexicon["id"], "label": lexicon["label"], "language": 
lexicon["language"], "email": lexicon["email"], "license": lexicon["license"], "version": lexicon["version"], } if lexicon.get("url"): attrib["url"] = lexicon["url"] if lexicon.get("citation"): attrib["citation"] = lexicon["citation"] if version >= (1, 1) and lexicon.get("logo"): attrib["logo"] = lexicon["logo"] attrib.update(_meta_dict(lexicon.get("meta"))) return attrib def _dump_dependency( dep: Dependency, deptype: str, out: TextIO, version: VersionInfo ) -> None: id_ref_key = "id" if version < (1, 4) else "ref" attrib = {id_ref_key: dep["id"], "version": dep["version"]} if (url := dep.get("url")) is not None: attrib["url"] = url elem = ET.Element(deptype, attrib=attrib) print(_tostring(elem, 2), file=out) def _dump_lexical_entry( entry: LexicalEntry | ExternalLexicalEntry, out: TextIO, version: VersionInfo, ) -> None: frames = [] attrib = {"id": entry["id"]} if entry.get("external", False): elem = ET.Element("ExternalLexicalEntry", attrib=attrib) if (lemma := entry.get("lemma")) is not None: assert lemma.get("external", False) elem.append(_build_lemma(lemma, version)) else: entry = cast("LexicalEntry", entry) if version >= (1, 4) and entry.get("index"): attrib["index"] = entry["index"] attrib.update(_meta_dict(entry.get("meta"))) elem = ET.Element("LexicalEntry", attrib=attrib) elem.append(_build_lemma(entry["lemma"], version)) if version < (1, 1): frames = [ _build_syntactic_behaviour(sb, version) for sb in entry.get("frames", []) ] elem.extend([_build_form(form, version) for form in entry.get("forms", [])]) elem.extend([_build_sense(sense, version) for sense in entry.get("senses", [])]) elem.extend(frames) print(_tostring(elem, 2), file=out) def _build_lemma(lemma: Lemma | ExternalLemma, version: VersionInfo) -> ET.Element: if lemma.get("external", False): elem = ET.Element("ExternalLemma") else: lemma = cast("Lemma", lemma) attrib = {"writtenForm": lemma["writtenForm"]} if lemma.get("script"): attrib["script"] = lemma["script"] attrib["partOfSpeech"] = 
lemma["partOfSpeech"] elem = ET.Element("Lemma", attrib=attrib) if version >= (1, 1): for pron in lemma.get("pronunciations", []): elem.append(_build_pronunciation(pron)) for tag in lemma.get("tags", []): elem.append(_build_tag(tag)) return elem def _build_form(form: Form | ExternalForm, version: VersionInfo) -> ET.Element: attrib = {} if version >= (1, 1) and form.get("id"): attrib["id"] = form["id"] if form.get("external", False): elem = ET.Element("ExternalForm", attrib=attrib) else: form = cast("Form", form) attrib["writtenForm"] = form["writtenForm"] if form.get("script"): attrib["script"] = form["script"] elem = ET.Element("Form", attrib=attrib) if version >= (1, 1): for pron in form.get("pronunciations", []): elem.append(_build_pronunciation(pron)) for tag in form.get("tags", []): elem.append(_build_tag(tag)) return elem def _build_pronunciation(pron: Pronunciation) -> ET.Element: attrib = {} if pron.get("variety"): attrib["variety"] = pron["variety"] if pron.get("notation"): attrib["notation"] = pron["notation"] if not pron.get("phonemic", True): attrib["phonemic"] = "false" if pron.get("audio"): attrib["audio"] = pron["audio"] elem = ET.Element("Pronunciation", attrib=attrib) elem.text = pron["text"] return elem def _build_tag(tag: Tag) -> ET.Element: elem = ET.Element("Tag", category=tag["category"]) elem.text = tag["text"] return elem def _build_sense( sense: Sense | ExternalSense, version: VersionInfo, ) -> ET.Element: attrib = {"id": sense["id"]} if sense.get("external"): elem = ET.Element("ExternalSense", attrib=attrib) else: sense = cast("Sense", sense) attrib["synset"] = sense["synset"] if version >= (1, 4) and sense.get("n"): attrib["n"] = str(sense["n"]) attrib.update(_meta_dict(sense.get("meta"))) if not sense.get("lexicalized", True): attrib["lexicalized"] = "false" if sense.get("adjposition"): attrib["adjposition"] = sense["adjposition"] if version >= (1, 1) and sense.get("subcat"): attrib["subcat"] = " ".join(sense["subcat"]) elem = 
ET.Element("Sense", attrib=attrib) elem.extend( [_build_relation(rel, "SenseRelation") for rel in sense.get("relations", [])] ) elem.extend([_build_example(ex) for ex in sense.get("examples", [])]) elem.extend([_build_count(cnt) for cnt in sense.get("counts", [])]) return elem def _build_example(example: Example) -> ET.Element: attrib: dict[str, str] = {} if example.get("language"): attrib["language"] = example["language"] attrib.update(_meta_dict(example.get("meta"))) elem = ET.Element("Example", attrib=attrib) elem.text = example["text"] return elem def _build_count(count: Count) -> ET.Element: elem = ET.Element("Count", attrib=_meta_dict(count.get("meta"))) elem.text = str(count["value"]) return elem def _dump_synset( synset: Synset | ExternalSynset, out: TextIO, version: VersionInfo ) -> None: attrib: dict[str, str] = {"id": synset["id"]} if synset.get("external", False): elem = ET.Element("ExternalSynset", attrib=attrib) elem.extend([_build_definition(defn) for defn in synset.get("definitions", [])]) else: synset = cast("Synset", synset) attrib["ili"] = synset["ili"] if synset.get("partOfSpeech"): attrib["partOfSpeech"] = synset["partOfSpeech"] if not synset.get("lexicalized", True): attrib["lexicalized"] = "false" if version >= (1, 1): if synset.get("members"): attrib["members"] = " ".join(synset["members"]) if synset.get("lexfile"): attrib["lexfile"] = synset["lexfile"] attrib.update(_meta_dict(synset.get("meta"))) elem = ET.Element("Synset", attrib=attrib) elem.extend([_build_definition(defn) for defn in synset.get("definitions", [])]) if synset.get("ili_definition"): elem.append(_build_ili_definition(synset["ili_definition"])) elem.extend( [_build_relation(rel, "SynsetRelation") for rel in synset.get("relations", [])] ) elem.extend([_build_example(ex) for ex in synset.get("examples", [])]) print(_tostring(elem, 2), file=out) def _build_definition(definition: Definition) -> ET.Element: attrib = {} if definition.get("language"): attrib["language"] = 
definition["language"] if definition.get("sourceSense"): attrib["sourceSense"] = definition["sourceSense"] attrib.update(_meta_dict(definition.get("meta"))) elem = ET.Element("Definition", attrib=attrib) elem.text = definition["text"] return elem def _build_ili_definition(ili_definition: ILIDefinition) -> ET.Element: elem = ET.Element("ILIDefinition", attrib=_meta_dict(ili_definition.get("meta"))) elem.text = ili_definition["text"] return elem def _build_relation(relation: Relation, elemtype: str) -> ET.Element: attrib = {"target": relation["target"], "relType": relation["relType"]} attrib.update(_meta_dict(relation.get("meta"))) return ET.Element(elemtype, attrib=attrib) def _dump_syntactic_behaviour( syntactic_behaviour: SyntacticBehaviour, out: TextIO, version: VersionInfo ) -> None: elem = _build_syntactic_behaviour(syntactic_behaviour, version) print(_tostring(elem, 2), file=out) def _build_syntactic_behaviour( syntactic_behaviour: SyntacticBehaviour, version: VersionInfo ) -> ET.Element: attrib = {"subcategorizationFrame": syntactic_behaviour["subcategorizationFrame"]} if version >= (1, 1) and syntactic_behaviour.get("id"): attrib["id"] = syntactic_behaviour["id"] elif version < (1, 1) and syntactic_behaviour.get("senses"): attrib["senses"] = " ".join(syntactic_behaviour["senses"]) return ET.Element("SyntacticBehaviour", attrib=attrib) def _tostring(elem: ET.Element, level: int, short_empty_elements: bool = True) -> str: _indent(elem, level) return (" " * level) + ET.tostring( elem, encoding="unicode", short_empty_elements=short_empty_elements ) def _indent(elem: ET.Element, level: int) -> None: self_indent = "\n" + " " * level child_indent = self_indent + " " if len(elem): if not elem.text or not elem.text.strip(): elem.text = child_indent for child in elem[:-1]: _indent(child, level + 1) child.tail = child_indent _indent(elem[-1], level + 1) elem[-1].tail = self_indent def _meta_dict(meta: Metadata | None) -> dict[str, str]: if meta is not None: # Literal 
keys are required for typing purposes, so first # construct the dict and then remove those that weren't specified. d = { "dc:contributor": meta.get("contributor", ""), "dc:coverage": meta.get("coverage", ""), "dc:creator": meta.get("creator", ""), "dc:date": meta.get("date", ""), "dc:description": meta.get("description", ""), "dc:format": meta.get("format", ""), "dc:identifier": meta.get("identifier", ""), "dc:publisher": meta.get("publisher", ""), "dc:relation": meta.get("relation", ""), "dc:rights": meta.get("rights", ""), "dc:source": meta.get("source", ""), "dc:subject": meta.get("subject", ""), "dc:title": meta.get("title", ""), "dc:type": meta.get("type", ""), "status": meta.get("status", ""), "note": meta.get("note", ""), } d = {key: val for key, val in d.items() if val} # this one requires a conversion, so do it separately if "confidenceScore" in meta: d["confidenceScore"] = str(meta["confidenceScore"]) else: d = {} return d wn-1.0.0/wn/metrics.py000066400000000000000000000004131513755206300146050ustar00rootroot00000000000000from wn._core import Synset, Word # Word-based Metrics def ambiguity(word: Word) -> int: return len(word.synsets()) def average_ambiguity(synset: Synset) -> float: words = synset.words() return sum(len(word.synsets()) for word in words) / len(words) wn-1.0.0/wn/morphy.py000066400000000000000000000114251513755206300144620ustar00rootroot00000000000000"""A simple English lemmatizer that finds and removes known suffixes.""" from enum import Flag, auto from typing import TypeAlias import wn from wn._types import LemmatizeResult from wn.constants import ADJ, ADJ_SAT, ADV, NOUN, PARTS_OF_SPEECH, VERB POSExceptionMap: TypeAlias = dict[str, set[str]] ExceptionMap: TypeAlias = dict[str, POSExceptionMap] class _System(Flag): """Flags to track suffix rules in various implementations of Morphy.""" PWN = auto() NLTK = auto() WN = auto() ALL = PWN | NLTK | WN _PWN = _System.PWN _NLTK = _System.NLTK _WN = _System.WN _ALL = _System.ALL Rule: TypeAlias = 
tuple[str, str, _System] DETACHMENT_RULES: dict[str, list[Rule]] = { NOUN: [ ("s", "", _ALL), ("ces", "x", _WN), ("ses", "s", _ALL), ("ves", "f", _NLTK | _WN), ("ives", "ife", _WN), ("xes", "x", _ALL), ("xes", "xis", _WN), ("zes", "z", _ALL), ("ches", "ch", _ALL), ("shes", "sh", _ALL), ("men", "man", _ALL), ("ies", "y", _ALL), ], VERB: [ ("s", "", _ALL), ("ies", "y", _ALL), ("es", "e", _ALL), ("es", "", _ALL), ("ed", "e", _ALL), ("ed", "", _ALL), ("ing", "e", _ALL), ("ing", "", _ALL), ], ADJ: [ ("er", "", _ALL), ("est", "", _ALL), ("er", "e", _ALL), ("est", "e", _ALL), ], ADV: [], } DETACHMENT_RULES[ADJ_SAT] = DETACHMENT_RULES[ADJ] class Morphy: """The Morphy lemmatizer class. Objects of this class are callables that take a wordform and an optional part of speech and return a dictionary mapping parts of speech to lemmas. If objects of this class are not created with a :class:`wn.Wordnet` object, the returned lemmas may be invalid. Arguments: wordnet: optional :class:`wn.Wordnet` instance Example: >>> import wn >>> from wn.morphy import Morphy >>> ewn = wn.Wordnet("ewn:2020") >>> m = Morphy(ewn) >>> m("axes", pos="n") {'n': {'axe', 'ax', 'axis'}} >>> m("geese", pos="n") {'n': {'goose'}} >>> m("gooses") {'n': {'goose'}, 'v': {'goose'}} >>> m("goosing") {'v': {'goose'}} """ def __init__(self, wordnet: wn.Wordnet | None = None): self._rules = { pos: [rule for rule in rules if rule[2] & _System.WN] for pos, rules in DETACHMENT_RULES.items() } exceptions: ExceptionMap = {pos: {} for pos in PARTS_OF_SPEECH} all_lemmas: dict[str, set[str]] = {pos: set() for pos in PARTS_OF_SPEECH} if wordnet: for word in wordnet.words(): pos = word.pos pos_exc = exceptions[pos] lemma, *others = word.forms() # store every lemma whether it has other forms or not all_lemmas[pos].add(lemma) # those with other forms map to the original lemmas for other in others: if other in pos_exc: pos_exc[other].add(lemma) else: pos_exc[other] = {lemma} self._initialized = True else: self._initialized = 
False self._exceptions = exceptions self._all_lemmas = all_lemmas def __call__(self, form: str, pos: str | None = None) -> LemmatizeResult: result = {} if not self._initialized: result[pos] = {form} # always include original when not initialized if pos is None: pos_list = list(DETACHMENT_RULES) elif pos in DETACHMENT_RULES: pos_list = [pos] else: pos_list = [] # not handled by morphy no_pos_forms = result.get(None, set()) # avoid unnecessary duplicates for _pos in pos_list: candidates = self._morphstr(form, _pos) - no_pos_forms if candidates: result.setdefault(_pos, set()).update(candidates) return result def _morphstr(self, form: str, pos: str) -> set[str]: candidates: set[str] = set() initialized = self._initialized if initialized: all_lemmas = self._all_lemmas[pos] if form in all_lemmas: candidates.add(form) candidates.update(self._exceptions[pos].get(form, set())) else: all_lemmas = set() for suffix, repl, _ in self._rules[pos]: # avoid applying rules that perform full suppletion if form.endswith(suffix) and len(suffix) < len(form): candidate = f"{form[: -len(suffix)]}{repl}" if not initialized or candidate in all_lemmas: candidates.add(candidate) return candidates morphy = Morphy() wn-1.0.0/wn/project.py000066400000000000000000000247171513755206300146220ustar00rootroot00000000000000""" Wordnet and ILI Packages and Collections """ import gzip import lzma import shutil import tarfile import tempfile from collections.abc import Iterator from pathlib import Path from wn import ili, lmf from wn._config import config from wn._exceptions import Error from wn._types import AnyPath from wn._util import is_gzip, is_lzma from wn.constants import _ILI, _WORDNET _ADDITIONAL_FILE_SUFFIXES = ("", ".txt", ".md", ".rst") def is_package_directory(path: AnyPath) -> bool: """Return ``True`` if *path* appears to be a wordnet or ILI package.""" path = Path(path).expanduser() return len(_package_directory_types(path)) == 1 def _package_directory_types(path: Path) -> list[tuple[Path, 
str]]: types: list[tuple[Path, str]] = [] if path.is_dir(): for p in path.iterdir(): typ = _resource_file_type(p) if typ is not None: types.append((p, typ)) return types def _resource_file_type(path: Path) -> str | None: if lmf.is_lmf(path): return _WORDNET elif ili.is_ili_tsv(path): return _ILI return None def is_collection_directory(path: AnyPath) -> bool: """Return ``True`` if *path* appears to be a wordnet collection.""" path = Path(path).expanduser() return ( path.is_dir() and len(list(filter(is_package_directory, path.iterdir()))) >= 1 ) class Project: """The base class for packages and collections.""" __slots__ = ("_path",) def __init__(self, path: AnyPath): self._path: Path = Path(path).expanduser() @property def path(self) -> Path: """The path of the project directory or resource file. For :class:`Package` and :class:`Collection` objects, the path is its directory. For :class:`ResourceOnlyPackage` objects, the path is the same as from :meth:`resource_file() ` """ return self._path def readme(self) -> Path | None: """Return the path of the README file, or :data:`None` if none exists.""" return self._find_file(self._path / "README", _ADDITIONAL_FILE_SUFFIXES) def license(self) -> Path | None: """Return the path of the license, or :data:`None` if none exists.""" return self._find_file(self._path / "LICENSE", _ADDITIONAL_FILE_SUFFIXES) def citation(self) -> Path | None: """Return the path of the citation, or :data:`None` if none exists.""" return self._find_file(self._path / "citation", (".bib",)) def _find_file(self, base: Path, suffixes: tuple[str, ...]) -> Path | None: for suffix in suffixes: base = base.with_suffix(suffix) if base.is_file(): return base return None class Package(Project): """A wordnet or ILI package. A package is a directory with a resource file and optional metadata files. """ @property def type(self) -> str | None: """Return the name of the type of resource contained by the package. 
Valid return values are: - :python:`"wordnet"` -- the resource is a WN-LMF lexicon file - :python:`"ili"` -- the resource is an interlingual index file - :data:`None` -- the resource type is undetermined """ return _resource_file_type(self.resource_file()) def resource_file(self) -> Path: """Return the path of the package's resource file.""" files = _package_directory_types(self._path) if not files: raise Error(f"no resource found in package: {self._path!s}") elif len(files) > 1: raise Error(f"multiple resource found in package: {self._path!s}") return files[0][0] class ResourceOnlyPackage(Package): """A virtual package for a single-file resource. This class is for resource files that are not distributed in a package directory. The :meth:`readme() `, :meth:`license() `, and :meth:`citation() ` methods all return :data:`None`. """ def resource_file(self) -> Path: return self._path def readme(self): return None def license(self): return None def citation(self): return None class Collection(Project): """A wordnet or ILI collection Collections are directories that contain package directories and optional metadata files. """ def packages(self) -> list[Package]: """Return the list of packages in the collection.""" return [ Package(path) for path in self._path.iterdir() if is_package_directory(path) ] def get_project( *, project: str | None = None, path: AnyPath | None = None, ) -> Project: """Return the :class:`Project` object for *project* or *path*. The *project* argument is a project specifier and will look in the download cache for the project data. If the project has not been downloaded and cached, an error will be raised. The *path* argument looks for project data at the given path. It can point to a resource file, a package directory, or a collection directory. Unlike :func:`iterpackages`, this function does not iterate over packages within a collection, and instead the :class:`Collection` object is returned. .. 
note:: If the target is compressed or archived, the data will be extracted to a temporary directory. It is the user's responsibility to delete this temporary directory, which is indicated by :data:`Project.path`. """ if project and path: raise TypeError("expected a project specifier or a path, not both") if not project and not path: raise TypeError("expected a project specifier or a path") if project: info = config.get_project_info(project) if not info["cache"]: raise Error(f"{project} is not cached; try `wn.download({project!r}` first") path = info["cache"] assert path proj, _ = _get_project_from_path(path) return proj def _get_project_from_path( path: AnyPath, tmp_path: Path | None = None, ) -> tuple[Project, Path | None]: path = Path(path).expanduser() if path.is_dir(): if is_package_directory(path): return Package(path), tmp_path elif is_collection_directory(path): return Collection(path), tmp_path else: raise Error( f"does not appear to be a valid package or collection: {path!s}" ) elif tarfile.is_tarfile(path): tmpdir_ = Path(tempfile.mkdtemp()) with tarfile.open(path) as tar: _check_tar(tar) tar.extractall(path=tmpdir_) contents = list(tmpdir_.iterdir()) if len(contents) != 1: raise Error( "archive may only have one resource, package, or collection" ) return _get_project_from_path(contents[0], tmp_path=tmpdir_) else: decompressed, tmp_path = _get_decompressed(path, tmp_path) if lmf.is_lmf(decompressed) or ili.is_ili_tsv(decompressed): return ResourceOnlyPackage(decompressed), tmp_path else: raise Error(f"not a valid lexical resource: {path!s}") def iterpackages(path: AnyPath, delete: bool = True) -> Iterator[Package]: """Yield any wordnet or ILI packages found at *path*. 
The *path* argument can point to one of the following: - a lexical resource file or ILI file - a wordnet package directory - a wordnet collection directory - a tar archive containing one of the above - a compressed (gzip or lzma) resource file or tar archive The *delete* argument determines whether any created temporary directories will be deleted after iteration is complete. When it is :data:`True`, the package objects can only be inspected during iteration. If one needs persistent objects (e.g., :python:`pkgs = list(iterpackages(...))`), then set *delete* to :data:`False`. .. warning:: When *delete* is set to :data:`False`, the user is responsible for cleaning up any temporary directories. The :data:`Project.path` attribute indicates the path of the temporary directory. """ project, tmp_path = _get_project_from_path(path) try: match project: case Package(): yield project case Collection(): yield from project.packages() case _: raise Error(f"unexpected project type: {project.__class__.__name__}") finally: if tmp_path and delete: if tmp_path.is_dir(): shutil.rmtree(tmp_path) elif tmp_path.is_file(): tmp_path.unlink() else: raise Error(f"could not remove temporary path: {tmp_path}") def _get_decompressed( source: Path, tmp_path: Path | None, ) -> tuple[Path, Path | None]: gzipped = is_gzip(source) xzipped = is_lzma(source) if not (gzipped or xzipped): return source, tmp_path else: tmp = tempfile.NamedTemporaryFile(suffix=".xml", delete=False) # noqa: SIM115 path = Path(tmp.name) try: if gzipped: with gzip.open(source, "rb") as gzip_src: shutil.copyfileobj(gzip_src, tmp) else: # xzipped with lzma.open(source, "rb") as lzma_src: shutil.copyfileobj(lzma_src, tmp) tmp.close() # Windows cannot reliably reopen until it's closed except (OSError, EOFError, lzma.LZMAError) as exc: raise Error(f"could not decompress file: {source}") from exc # if tmp_path is not None, the compressed file was in a # temporary directory, so return that. 
Otherwise the new path # becomes the tmp_path return path, tmp_path or path def _check_tar(tar: tarfile.TarFile) -> None: """Check the tarfile to avoid potential security issues. Currently collections and packages have the following constraints: - Only regular files or directories - No paths starting with '/' or containing '..' """ for info in tar.getmembers(): if not (info.isfile() or info.isdir()): raise Error( f"tarfile member is not a regular file or directory: {info.name}" ) if info.name.startswith("/") or ".." in info.name: raise Error( f"tarfile member paths may not be absolute or contain ..: {info.name}" ) wn-1.0.0/wn/py.typed000066400000000000000000000000011513755206300142550ustar00rootroot00000000000000 wn-1.0.0/wn/schema.sql000066400000000000000000000234411513755206300145540ustar00rootroot00000000000000 -- ILI : Interlingual Index CREATE TABLE ilis ( rowid INTEGER PRIMARY KEY, id TEXT NOT NULL, status_rowid INTEGER NOT NULL REFERENCES ili_statuses (rowid), definition TEXT, metadata META, UNIQUE (id) ); CREATE INDEX ili_id_index ON ilis (id); CREATE TABLE proposed_ilis ( rowid INTEGER PRIMARY KEY, synset_rowid INTEGER REFERENCES synsets (rowid) ON DELETE CASCADE, definition TEXT, metadata META, UNIQUE (synset_rowid) ); CREATE INDEX proposed_ili_synset_rowid_index ON proposed_ilis (synset_rowid); -- Wordnet lexicons CREATE TABLE lexicons ( rowid INTEGER PRIMARY KEY, -- unique database-internal id specifier TEXT NOT NULL, -- lexicon specifer -> id:version id TEXT NOT NULL, -- user-facing id label TEXT NOT NULL, language TEXT NOT NULL, -- bcp-47 language tag email TEXT NOT NULL, license TEXT NOT NULL, version TEXT NOT NULL, url TEXT, citation TEXT, logo TEXT, metadata META, modified BOOLEAN CHECK( modified IN (0, 1) ) DEFAULT 0 NOT NULL, UNIQUE (id, version), UNIQUE (specifier) ); CREATE INDEX lexicon_specifier_index ON lexicons (specifier); CREATE TABLE lexicon_dependencies ( dependent_rowid INTEGER NOT NULL REFERENCES lexicons (rowid) ON DELETE CASCADE, 
provider_id TEXT NOT NULL, provider_version TEXT NOT NULL, provider_url TEXT, provider_rowid INTEGER REFERENCES lexicons (rowid) ON DELETE SET NULL ); CREATE INDEX lexicon_dependent_index ON lexicon_dependencies(dependent_rowid); CREATE TABLE lexicon_extensions ( extension_rowid INTEGER NOT NULL REFERENCES lexicons (rowid) ON DELETE CASCADE, base_id TEXT NOT NULL, base_version TEXT NOT NULL, base_url TEXT, base_rowid INTEGER REFERENCES lexicons (rowid), UNIQUE (extension_rowid, base_rowid) ); CREATE INDEX lexicon_extension_index ON lexicon_extensions(extension_rowid); -- Lexical Entries CREATE TABLE entry_index ( entry_rowid INTEGER NOT NULL REFERENCES entries (rowid) ON DELETE CASCADE, lemma TEXT NOT NULL, UNIQUE (entry_rowid) ); CREATE INDEX entry_index_entry_index ON entry_index(entry_rowid); CREATE INDEX entry_index_lemma_index ON entry_index(lemma); /* The 'lemma' entity of a lexical entry is just a form, but it should be the only form with rank = 0. After that, rank can be used to indicate preference for a form. 
*/ CREATE TABLE entries ( rowid INTEGER PRIMARY KEY, id TEXT NOT NULL, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons (rowid) ON DELETE CASCADE, pos TEXT NOT NULL, metadata META, UNIQUE (id, lexicon_rowid) ); CREATE INDEX entry_id_index ON entries (id); CREATE TABLE forms ( rowid INTEGER PRIMARY KEY, id TEXT, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons(rowid) ON DELETE CASCADE, entry_rowid INTEGER NOT NULL REFERENCES entries(rowid) ON DELETE CASCADE, form TEXT NOT NULL, normalized_form TEXT, script TEXT, rank INTEGER DEFAULT 1, -- rank 0 is the preferred lemma UNIQUE (entry_rowid, form, script) ); CREATE INDEX form_entry_index ON forms (entry_rowid); CREATE INDEX form_index ON forms (form); CREATE INDEX form_norm_index ON forms (normalized_form); CREATE TABLE pronunciations ( form_rowid INTEGER NOT NULL REFERENCES forms (rowid) ON DELETE CASCADE, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons(rowid) ON DELETE CASCADE, value TEXT, variety TEXT, notation TEXT, phonemic BOOLEAN CHECK( phonemic IN (0, 1) ) DEFAULT 1 NOT NULL, audio TEXT ); CREATE INDEX pronunciation_form_index ON pronunciations (form_rowid); CREATE TABLE tags ( form_rowid INTEGER NOT NULL REFERENCES forms (rowid) ON DELETE CASCADE, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons(rowid) ON DELETE CASCADE, tag TEXT, category TEXT ); CREATE INDEX tag_form_index ON tags (form_rowid); -- Synsets CREATE TABLE synsets ( rowid INTEGER PRIMARY KEY, id TEXT NOT NULL, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons (rowid) ON DELETE CASCADE, ili_rowid INTEGER REFERENCES ilis (rowid), pos TEXT, lexfile_rowid INTEGER REFERENCES lexfiles (rowid), metadata META ); CREATE INDEX synset_id_index ON synsets (id); CREATE INDEX synset_ili_rowid_index ON synsets (ili_rowid); CREATE TABLE unlexicalized_synsets ( synset_rowid INTEGER NOT NULL REFERENCES synsets (rowid) ON DELETE CASCADE ); CREATE INDEX unlexicalized_synsets_index ON unlexicalized_synsets (synset_rowid); CREATE TABLE synset_relations ( 
rowid INTEGER PRIMARY KEY, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons (rowid) ON DELETE CASCADE, source_rowid INTEGER NOT NULL REFERENCES synsets(rowid) ON DELETE CASCADE, target_rowid INTEGER NOT NULL REFERENCES synsets(rowid) ON DELETE CASCADE, type_rowid INTEGER NOT NULL REFERENCES relation_types(rowid), metadata META ); CREATE INDEX synset_relation_source_index ON synset_relations (source_rowid); CREATE INDEX synset_relation_target_index ON synset_relations (target_rowid); CREATE TABLE definitions ( rowid INTEGER PRIMARY KEY, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons(rowid) ON DELETE CASCADE, synset_rowid INTEGER NOT NULL REFERENCES synsets(rowid) ON DELETE CASCADE, definition TEXT, language TEXT, -- bcp-47 language tag sense_rowid INTEGER REFERENCES senses(rowid) ON DELETE SET NULL, metadata META ); CREATE INDEX definition_rowid_index ON definitions (synset_rowid); CREATE INDEX definition_sense_index ON definitions (sense_rowid); CREATE TABLE synset_examples ( rowid INTEGER PRIMARY KEY, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons(rowid) ON DELETE CASCADE, synset_rowid INTEGER NOT NULL REFERENCES synsets(rowid) ON DELETE CASCADE, example TEXT, language TEXT, -- bcp-47 language tag metadata META ); CREATE INDEX synset_example_rowid_index ON synset_examples(synset_rowid); -- Senses CREATE TABLE senses ( rowid INTEGER PRIMARY KEY, id TEXT NOT NULL, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons(rowid) ON DELETE CASCADE, entry_rowid INTEGER NOT NULL REFERENCES entries(rowid) ON DELETE CASCADE, entry_rank INTEGER DEFAULT 1, synset_rowid INTEGER NOT NULL REFERENCES synsets(rowid) ON DELETE CASCADE, synset_rank INTEGER DEFAULT 1, metadata META ); CREATE INDEX sense_id_index ON senses(id); CREATE INDEX sense_entry_rowid_index ON senses (entry_rowid); CREATE INDEX sense_synset_rowid_index ON senses (synset_rowid); CREATE TABLE unlexicalized_senses ( sense_rowid INTEGER NOT NULL REFERENCES senses (rowid) ON DELETE CASCADE ); CREATE INDEX 
unlexicalized_senses_index ON unlexicalized_senses (sense_rowid); CREATE TABLE sense_relations ( rowid INTEGER PRIMARY KEY, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons (rowid) ON DELETE CASCADE, source_rowid INTEGER NOT NULL REFERENCES senses(rowid) ON DELETE CASCADE, target_rowid INTEGER NOT NULL REFERENCES senses(rowid) ON DELETE CASCADE, type_rowid INTEGER NOT NULL REFERENCES relation_types(rowid), metadata META ); CREATE INDEX sense_relation_source_index ON sense_relations (source_rowid); CREATE INDEX sense_relation_target_index ON sense_relations (target_rowid); CREATE TABLE sense_synset_relations ( rowid INTEGER PRIMARY KEY, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons (rowid) ON DELETE CASCADE, source_rowid INTEGER NOT NULL REFERENCES senses(rowid) ON DELETE CASCADE, target_rowid INTEGER NOT NULL REFERENCES synsets(rowid) ON DELETE CASCADE, type_rowid INTEGER NOT NULL REFERENCES relation_types(rowid), metadata META ); CREATE INDEX sense_synset_relation_source_index ON sense_synset_relations (source_rowid); CREATE INDEX sense_synset_relation_target_index ON sense_synset_relations (target_rowid); CREATE TABLE adjpositions ( sense_rowid INTEGER NOT NULL REFERENCES senses(rowid) ON DELETE CASCADE, adjposition TEXT NOT NULL ); CREATE INDEX adjposition_sense_index ON adjpositions (sense_rowid); CREATE TABLE sense_examples ( rowid INTEGER PRIMARY KEY, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons(rowid) ON DELETE CASCADE, sense_rowid INTEGER NOT NULL REFERENCES senses(rowid) ON DELETE CASCADE, example TEXT, language TEXT, -- bcp-47 language tag metadata META ); CREATE INDEX sense_example_index ON sense_examples (sense_rowid); CREATE TABLE counts ( rowid INTEGER PRIMARY KEY, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons(rowid) ON DELETE CASCADE, sense_rowid INTEGER NOT NULL REFERENCES senses(rowid) ON DELETE CASCADE, count INTEGER NOT NULL, metadata META ); CREATE INDEX count_index ON counts(sense_rowid); -- Syntactic Behaviours CREATE TABLE 
syntactic_behaviours ( rowid INTEGER PRIMARY KEY, id TEXT, lexicon_rowid INTEGER NOT NULL REFERENCES lexicons (rowid) ON DELETE CASCADE, frame TEXT NOT NULL, UNIQUE (lexicon_rowid, id), UNIQUE (lexicon_rowid, frame) ); CREATE INDEX syntactic_behaviour_id_index ON syntactic_behaviours (id); CREATE TABLE syntactic_behaviour_senses ( syntactic_behaviour_rowid INTEGER NOT NULL REFERENCES syntactic_behaviours (rowid) ON DELETE CASCADE, sense_rowid INTEGER NOT NULL REFERENCES senses (rowid) ON DELETE CASCADE ); CREATE INDEX syntactic_behaviour_sense_sb_index ON syntactic_behaviour_senses (syntactic_behaviour_rowid); CREATE INDEX syntactic_behaviour_sense_sense_index ON syntactic_behaviour_senses (sense_rowid); -- Lookup Tables CREATE TABLE relation_types ( rowid INTEGER PRIMARY KEY, type TEXT NOT NULL, UNIQUE (type) ); CREATE INDEX relation_type_index ON relation_types (type); CREATE TABLE ili_statuses ( rowid INTEGER PRIMARY KEY, status TEXT NOT NULL, UNIQUE (status) ); CREATE INDEX ili_status_index ON ili_statuses (status); CREATE TABLE lexfiles ( rowid INTEGER PRIMARY KEY, name TEXT NOT NULL, UNIQUE (name) ); CREATE INDEX lexfile_index ON lexfiles (name); wn-1.0.0/wn/similarity.py000066400000000000000000000176101513755206300153340ustar00rootroot00000000000000"""Synset similarity metrics.""" import math import wn from wn._core import Synset from wn.constants import ADJ, ADJ_SAT from wn.ic import Freq, information_content def path(synset1: Synset, synset2: Synset, simulate_root: bool = False) -> float: """Return the Path similarity of *synset1* and *synset2*. Arguments: synset1: The first synset to compare. synset2: The second synset to compare. simulate_root: When :python:`True`, a fake root node connects all other roots; default: :python:`False`. 
Example: >>> import wn >>> from wn.similarity import path >>> ewn = wn.Wordnet("ewn:2020") >>> spatula = ewn.synsets("spatula")[0] >>> path(spatula, ewn.synsets("pancake")[0]) 0.058823529411764705 >>> path(spatula, ewn.synsets("utensil")[0]) 0.2 >>> path(spatula, spatula) 1.0 >>> flip = ewn.synsets("flip", pos="v")[0] >>> turn_over = ewn.synsets("turn over", pos="v")[0] >>> path(flip, turn_over) 0.0 >>> path(flip, turn_over, simulate_root=True) 0.16666666666666666 """ _check_if_pos_compatible(synset1.pos, synset2.pos) try: path = synset1.shortest_path(synset2, simulate_root=simulate_root) except wn.Error: distance = float("inf") else: distance = len(path) return 1 / (distance + 1) def wup(synset1: Synset, synset2: Synset, simulate_root=False) -> float: """Return the Wu-Palmer similarity of *synset1* and *synset2*. Arguments: synset1: The first synset to compare. synset2: The second synset to compare. simulate_root: When :python:`True`, a fake root node connects all other roots; default: :python:`False`. Raises: wn.Error: When no path connects the *synset1* and *synset2*. 
Example: >>> import wn >>> from wn.similarity import wup >>> ewn = wn.Wordnet("ewn:2020") >>> spatula = ewn.synsets("spatula")[0] >>> wup(spatula, ewn.synsets("pancake")[0]) 0.2 >>> wup(spatula, ewn.synsets("utensil")[0]) 0.8 >>> wup(spatula, spatula) 1.0 >>> flip = ewn.synsets("flip", pos="v")[0] >>> turn_over = ewn.synsets("turn over", pos="v")[0] >>> wup(flip, turn_over, simulate_root=True) 0.2857142857142857 """ _check_if_pos_compatible(synset1.pos, synset2.pos) lcs_list = _least_common_subsumers(synset1, synset2, simulate_root) lcs = lcs_list[0] i = len(synset1.shortest_path(lcs, simulate_root=simulate_root)) j = len(synset2.shortest_path(lcs, simulate_root=simulate_root)) k = lcs.max_depth() + 1 return (2 * k) / (i + j + 2 * k) def lch( synset1: Synset, synset2: Synset, max_depth: int, simulate_root: bool = False ) -> float: """Return the Leacock-Chodorow similarity between *synset1* and *synset2*. Arguments: synset1: The first synset to compare. synset2: The second synset to compare. max_depth: The taxonomy depth (see :func:`wn.taxonomy.taxonomy_depth`) simulate_root: When :python:`True`, a fake root node connects all other roots; default: :python:`False`. 
Example: >>> import wn, wn.taxonomy >>> from wn.similarity import lch >>> ewn = wn.Wordnet("ewn:2020") >>> n_depth = wn.taxonomy.taxonomy_depth(ewn, "n") >>> spatula = ewn.synsets("spatula")[0] >>> lch(spatula, ewn.synsets("pancake")[0], n_depth) 0.8043728156701697 >>> lch(spatula, ewn.synsets("utensil")[0], n_depth) 2.0281482472922856 >>> lch(spatula, spatula, n_depth) 3.6375861597263857 >>> v_depth = taxonomy.taxonomy_depth(ewn, "v") >>> flip = ewn.synsets("flip", pos="v")[0] >>> turn_over = ewn.synsets("turn over", pos="v")[0] >>> lch(flip, turn_over, v_depth, simulate_root=True) 1.3862943611198906 """ _check_if_pos_compatible(synset1.pos, synset2.pos) distance = len(synset1.shortest_path(synset2, simulate_root=simulate_root)) if max_depth <= 0: raise wn.Error("max_depth must be greater than 0") return -math.log((distance + 1) / (2 * max_depth)) def res(synset1: Synset, synset2: Synset, ic: Freq) -> float: """Return the Resnik similarity between *synset1* and *synset2*. Arguments: synset1: The first synset to compare. synset2: The second synset to compare. ic: Information Content weights. Example: >>> import wn, wn.ic, wn.taxonomy >>> from wn.similarity import res >>> pwn = wn.Wordnet("pwn:3.0") >>> ic = wn.ic.load("~/nltk_data/corpora/wordnet_ic/ic-brown.dat", pwn) >>> spatula = pwn.synsets("spatula")[0] >>> res(spatula, pwn.synsets("pancake")[0], ic) 0.8017591149538994 >>> res(spatula, pwn.synsets("utensil")[0], ic) 5.87738923441087 """ _check_if_pos_compatible(synset1.pos, synset2.pos) lcs = _most_informative_lcs(synset1, synset2, ic) return information_content(lcs, ic) def jcn(synset1: Synset, synset2: Synset, ic: Freq) -> float: """Return the Jiang-Conrath similarity of two synsets. Arguments: synset1: The first synset to compare. synset2: The second synset to compare. ic: Information Content weights. 
Example: >>> import wn, wn.ic, wn.taxonomy >>> from wn.similarity import jcn >>> pwn = wn.Wordnet("pwn:3.0") >>> ic = wn.ic.load("~/nltk_data/corpora/wordnet_ic/ic-brown.dat", pwn) >>> spatula = pwn.synsets("spatula")[0] >>> jcn(spatula, pwn.synsets("pancake")[0], ic) 0.04061799236354239 >>> jcn(spatula, pwn.synsets("utensil")[0], ic) 0.10794048564613007 """ _check_if_pos_compatible(synset1.pos, synset2.pos) ic1 = information_content(synset1, ic) ic2 = information_content(synset2, ic) lcs = _most_informative_lcs(synset1, synset2, ic) ic_lcs = information_content(lcs, ic) if ic1 == ic2 == ic_lcs == 0: return 0 elif ic1 + ic2 == 2 * ic_lcs: return float("inf") else: return 1 / (ic1 + ic2 - 2 * ic_lcs) def lin(synset1: Synset, synset2: Synset, ic: Freq) -> float: """Return the Lin similarity of two synsets. Arguments: synset1: The first synset to compare. synset2: The second synset to compare. ic: Information Content weights. Example: >>> import wn, wn.ic, wn.taxonomy >>> from wn.similarity import lin >>> pwn = wn.Wordnet("pwn:3.0") >>> ic = wn.ic.load("~/nltk_data/corpora/wordnet_ic/ic-brown.dat", pwn) >>> spatula = pwn.synsets("spatula")[0] >>> lin(spatula, pwn.synsets("pancake")[0], ic) 0.061148956278604116 >>> lin(spatula, pwn.synsets("utensil")[0], ic) 0.5592415686750427 """ _check_if_pos_compatible(synset1.pos, synset2.pos) lcs = _most_informative_lcs(synset1, synset2, ic) ic1 = information_content(synset1, ic) ic2 = information_content(synset2, ic) if ic1 == 0 or ic2 == 0: return 0.0 return 2 * information_content(lcs, ic) / (ic1 + ic2) # Helper functions def _least_common_subsumers( synset1: Synset, synset2: Synset, simulate_root: bool ) -> list[Synset]: lcs = synset1.lowest_common_hypernyms(synset2, simulate_root=simulate_root) if not lcs: raise wn.Error(f"no common hypernyms for {synset1!r} and {synset2!r}") return lcs def _most_informative_lcs(synset1: Synset, synset2: Synset, ic: Freq) -> Synset: pos_ic = ic[synset1.pos] lcs = 
_least_common_subsumers(synset1, synset2, False) return max(lcs, key=lambda ss: pos_ic[ss.id]) def _check_if_pos_compatible(pos1: str, pos2: str) -> None: _pos1 = ADJ if pos1 == ADJ_SAT else pos1 _pos2 = ADJ if pos2 == ADJ_SAT else pos2 if _pos1 != _pos2: raise wn.Error("synsets must have the same part of speech") wn-1.0.0/wn/taxonomy.py000066400000000000000000000260171513755206300150250ustar00rootroot00000000000000"""Functions for working with hypernym/hyponym taxonomies.""" from __future__ import annotations import wn from wn._util import flatten from wn.constants import ADJ, ADJ_SAT _FAKE_ROOT = "*ROOT*" def roots(wordnet: wn.Wordnet, pos: str | None = None) -> list[wn.Synset]: """Return the list of root synsets in *wordnet*. Arguments: wordnet: The wordnet from which root synsets are found. pos: If given, only return synsets with the specified part of speech. Example: >>> import wn, wn.taxonomy >>> ewn = wn.Wordnet("ewn:2020") >>> len(wn.taxonomy.roots(ewn, pos="v")) 573 """ return [ss for ss in _synsets_for_pos(wordnet, pos) if not ss.hypernyms()] def leaves(wordnet: wn.Wordnet, pos: str | None = None) -> list[wn.Synset]: """Return the list of leaf synsets in *wordnet*. Arguments: wordnet: The wordnet from which leaf synsets are found. pos: If given, only return synsets with the specified part of speech. Example: >>> import wn, wn.taxonomy >>> ewn = wn.Wordnet("ewn:2020") >>> len(wn.taxonomy.leaves(ewn, pos="v")) 10525 """ return [ss for ss in _synsets_for_pos(wordnet, pos) if not ss.hyponyms()] def taxonomy_depth(wordnet: wn.Wordnet, pos: str) -> int: """Return the maximum depth of the taxonomy for the given part of speech. Arguments: wordnet: The wordnet for which the taxonomy depth will be calculated. pos: The part of speech for which the taxonomy depth will be calculated. 
Example: >>> import wn, wn.taxonomy >>> ewn = wn.Wordnet("ewn:2020") >>> wn.taxonomy.taxonomy_depth(ewn, "n") 19 """ seen: set[wn.Synset] = set() depth = 0 for ss in _synsets_for_pos(wordnet, pos): if all(hyp in seen for hyp in ss.hypernyms()): continue paths = ss.hypernym_paths() if paths: depth = max(depth, max(len(path) for path in paths)) seen.update(hyp for path in paths for hyp in path) return depth def _synsets_for_pos(wordnet: wn.Wordnet, pos: str | None) -> list[wn.Synset]: """Get the list of synsets for a part of speech. If *pos* is 'a' or 's', also include those for the other. """ synsets = wordnet.synsets(pos=pos) if pos == ADJ: synsets.extend(wordnet.synsets(pos=ADJ_SAT)) elif pos == ADJ_SAT: synsets.extend(wordnet.synsets(pos=ADJ)) return synsets def _hypernym_paths( synset: wn.Synset, simulate_root: bool, include_self: bool, ) -> list[list[wn.Synset]]: paths = list(synset.relation_paths("hypernym", "instance_hypernym")) if include_self: paths = [[synset, *path] for path in paths] or [[synset]] if simulate_root and synset.id != _FAKE_ROOT: root = wn.Synset.empty( id=_FAKE_ROOT, _lexicon=synset._lexicon, _lexconf=synset._lexconf ) paths = [[*path, root] for path in paths] or [[root]] return paths def hypernym_paths( synset: wn.Synset, simulate_root: bool = False, ) -> list[list[wn.Synset]]: """Return the list of hypernym paths to a root synset. Arguments: synset: The starting synset for paths to a root. simulate_root: If :python:`True`, find the path to a simulated root node. Example: >>> import wn, wn.taxonomy >>> dog = wn.synsets("dog", pos="n")[0] >>> for path in wn.taxonomy.hypernym_paths(dog): ... for i, ss in enumerate(path): ... 
print(" " * i, ss, ss.lemmas()[0]) Synset('pwn-02083346-n') canine Synset('pwn-02075296-n') carnivore Synset('pwn-01886756-n') eutherian mammal Synset('pwn-01861778-n') mammalian Synset('pwn-01471682-n') craniate Synset('pwn-01466257-n') chordate Synset('pwn-00015388-n') animal Synset('pwn-00004475-n') organism Synset('pwn-00004258-n') animate thing Synset('pwn-00003553-n') unit Synset('pwn-00002684-n') object Synset('pwn-00001930-n') physical entity Synset('pwn-00001740-n') entity Synset('pwn-01317541-n') domesticated animal Synset('pwn-00015388-n') animal Synset('pwn-00004475-n') organism Synset('pwn-00004258-n') animate thing Synset('pwn-00003553-n') unit Synset('pwn-00002684-n') object Synset('pwn-00001930-n') physical entity Synset('pwn-00001740-n') entity """ return _hypernym_paths(synset, simulate_root, False) def min_depth(synset: wn.Synset, simulate_root: bool = False) -> int: """Return the minimum taxonomy depth of the synset. Arguments: synset: The starting synset for paths to a root. simulate_root: If :python:`True`, find the depth to a simulated root node. Example: >>> import wn, wn.taxonomy >>> dog = wn.synsets("dog", pos="n")[0] >>> wn.taxonomy.min_depth(dog) 8 """ return min( (len(path) for path in synset.hypernym_paths(simulate_root=simulate_root)), default=0, ) def max_depth(synset: wn.Synset, simulate_root: bool = False) -> int: """Return the maximum taxonomy depth of the synset. Arguments: synset: The starting synset for paths to a root. simulate_root: If :python:`True`, find the depth to a simulated root node. 
Example: >>> import wn, wn.taxonomy >>> dog = wn.synsets("dog", pos="n")[0] >>> wn.taxonomy.max_depth(dog) 13 """ return max( (len(path) for path in synset.hypernym_paths(simulate_root=simulate_root)), default=0, ) def _shortest_hyp_paths( synset: wn.Synset, other: wn.Synset, simulate_root: bool, ) -> dict[tuple[wn.Synset, int], list[wn.Synset]]: if synset == other: return {(synset, 0): []} from_self = _hypernym_paths(synset, simulate_root, True) from_other = _hypernym_paths(other, simulate_root, True) common = set(flatten(from_self)).intersection(flatten(from_other)) if not common: return {} # Compute depths of common hypernyms from their distances. # Doing this now avoid more expensive lookups later. depths: dict[wn.Synset, int] = {} # subpaths accumulates paths to common hypernyms from both sides subpaths: dict[wn.Synset, tuple[list[list[wn.Synset]], list[list[wn.Synset]]]] subpaths = {ss: ([], []) for ss in common} for which, paths in (0, from_self), (1, from_other): for path in paths: for dist, ss in enumerate(path): if ss in common: # synset or other subpath to ss (not including ss) subpaths[ss][which].append(path[: dist + 1]) # keep maximum depth depth = len(path) - dist - 1 if ss not in depths or depths[ss] < depth: depths[ss] = depth shortest: dict[tuple[wn.Synset, int], list[wn.Synset]] = {} for ss in common: from_self_subpaths, from_other_subpaths = subpaths[ss] shortest_from_self = min(from_self_subpaths, key=len) # for the other path, we need to reverse it and remove the pivot synset # (ty doesn't infer the result of min() correctly, hence the ignore) shortest_from_other = min(from_other_subpaths, key=len)[-2::-1] # type: ignore shortest[(ss, depths[ss])] = shortest_from_self + shortest_from_other return shortest def shortest_path( synset: wn.Synset, other: wn.Synset, simulate_root: bool = False, ) -> list[wn.Synset]: """Return the shortest path from *synset* to the *other* synset. 
Arguments: other: endpoint synset of the path simulate_root: if :python:`True`, ensure any two synsets are always connected by positing a fake root node Example: >>> import wn, wn.taxonomy >>> dog = ewn.synsets("dog", pos="n")[0] >>> squirrel = ewn.synsets("squirrel", pos="n")[0] >>> for ss in wn.taxonomy.shortest_path(dog, squirrel): ... print(ss.lemmas()) ['canine', 'canid'] ['carnivore'] ['eutherian mammal', 'placental', 'placental mammal', 'eutherian'] ['rodent', 'gnawer'] ['squirrel'] """ pathmap = _shortest_hyp_paths(synset, other, simulate_root) key = min(pathmap, key=lambda key: len(pathmap[key]), default=None) if key is None: raise wn.Error(f"no path between {synset!r} and {other!r}") return pathmap[key][1:] def common_hypernyms( synset: wn.Synset, other: wn.Synset, simulate_root: bool = False, ) -> list[wn.Synset]: """Return the common hypernyms for the current and *other* synsets. Arguments: other: synset that is a hyponym of any shared hypernyms simulate_root: if :python:`True`, ensure any two synsets always share a hypernym by positing a fake root node Example: >>> import wn, wn.taxonomy >>> dog = ewn.synsets("dog", pos="n")[0] >>> squirrel = ewn.synsets("squirrel", pos="n")[0] >>> for ss in wn.taxonomy.common_hypernyms(dog, squirrel): ... 
print(ss.lemmas()) ['entity'] ['physical entity'] ['object', 'physical object'] ['unit', 'whole'] ['animate thing', 'living thing'] ['organism', 'being'] ['fauna', 'beast', 'animate being', 'brute', 'creature', 'animal'] ['chordate'] ['craniate', 'vertebrate'] ['mammalian', 'mammal'] ['eutherian mammal', 'placental', 'placental mammal', 'eutherian'] """ from_self = _hypernym_paths(synset, simulate_root, True) from_other = _hypernym_paths(other, simulate_root, True) common = set(flatten(from_self)).intersection(flatten(from_other)) return sorted(common, key=lambda ss: ss.id) def lowest_common_hypernyms( synset: wn.Synset, other: wn.Synset, simulate_root: bool = False, ) -> list[wn.Synset]: """Return the common hypernyms furthest from the root. Arguments: other: synset that is a hyponym of any shared hypernyms simulate_root: if :python:`True`, ensure any two synsets always share a hypernym by positing a fake root node Example: >>> import wn, wn.taxonomy >>> dog = ewn.synsets("dog", pos="n")[0] >>> squirrel = ewn.synsets("squirrel", pos="n")[0] >>> len(wn.taxonomy.lowest_common_hypernyms(dog, squirrel)) 1 >>> wn.taxonomy.lowest_common_hypernyms(dog, squirrel)[0].lemmas() ['eutherian mammal', 'placental', 'placental mammal', 'eutherian'] """ pathmap = _shortest_hyp_paths(synset, other, simulate_root) # keys of pathmap are (synset, depth_of_synset) max_depth: int = max([depth for _, depth in pathmap], default=-1) if max_depth == -1: return [] else: return [ss for ss, d in pathmap if d == max_depth] wn-1.0.0/wn/util.py000066400000000000000000000143521513755206300141230ustar00rootroot00000000000000"""Wn utility classes.""" import sys from collections.abc import Callable from typing import TextIO def synset_id_formatter(fmt: str = "{prefix}-{offset:08}-{pos}", **kwargs) -> Callable: """Return a function for formatting synset ids. The *fmt* argument can be customized. 
It will be formatted using any other keyword arguments given to this function and any given to the resulting function. By default, the format string expects a ``prefix`` string argument for the namespace (such as a lexicon id), an ``offset`` integer argument (such as a WNDB offset), and a ``pos`` string argument. Arguments: fmt: A Python format string **kwargs: Keyword arguments for the format string. Example: >>> pwn_synset_id = synset_id_formatter(prefix="pwn") >>> pwn_synset_id(offset=1174, pos="n") 'pwn-00001174-n' """ def format_synset_id(**_kwargs) -> str: return fmt.format(**kwargs, **_kwargs) return format_synset_id class ProgressHandler: """An interface for updating progress in long-running processes. Long-running processes in Wn, such as :func:`wn.download` and :func:`wn.add`, call to a progress handler object as they go. The default progress handler used by Wn is :class:`ProgressBar`, which updates progress by formatting and printing a textual bar to stderr. The :class:`ProgressHandler` class may be used directly, which does nothing, or users may create their own subclasses for, e.g., updating a GUI or some other handler. The initialization parameters, except for ``file``, are stored in a :attr:`kwargs` member and may be updated after the handler is created through the :meth:`set` method. The :meth:`update` method is the primary way a counter is updated. The :meth:`flash` method is sometimes called for simple messages. When the process is complete, the :meth:`close` method is called, optionally with a message. 
""" def __init__( self, *, message: str = "", count: int = 0, total: int = 0, refresh_interval: int = 0, unit: str = "", status: str = "", file: TextIO = sys.stderr, ): self.file = file self.kwargs = { "count": count, "total": total, "refresh_interval": refresh_interval, "message": message, "unit": unit, "status": status, } self._refresh_quota: int = refresh_interval def update(self, n: int = 1, force: bool = False) -> None: """Update the counter with the increment value *n*. This method should update the ``count`` key of :attr:`kwargs` with the increment value *n*. After this, it is expected to update some user-facing progress indicator. If *force* is :python:`True`, any indicator will be refreshed regardless of the value of the refresh interval. """ self.kwargs["count"] += n # type: ignore def set(self, **kwargs) -> None: """Update progress handler parameters. Calling this method also runs :meth:`update` with an increment of 0, which causes a refresh of any indicator without changing the counter. """ self.kwargs.update(**kwargs) self.update(0, force=True) def flash(self, message: str) -> None: """Issue a message unrelated to the current counter. This may be useful for multi-stage processes to indicate the move to a new stage, or to log unexpected situations. """ pass def close(self) -> None: """Close the progress handler. This might be useful for closing file handles or cleaning up resources. """ pass class ProgressBar(ProgressHandler): """A :class:`ProgressHandler` subclass for printing a progress bar. Example: >>> p = ProgressBar(message="Progress: ", total=10, unit=" units") >>> p.update(3) Progress: [######### ] (3/10 units) See :meth:`format` for a description of how the progress bar is formatted. """ #: The default formatting template. 
FMT = "\r{message}{bar}{counter}{status}" def update(self, n: int = 1, force: bool = False) -> None: """Increment the count by *n* and print the reformatted bar.""" self.kwargs["count"] += n # type: ignore self._refresh_quota -= n if force or self._refresh_quota <= 0: self._refresh_quota = self.kwargs["refresh_interval"] # type: ignore s = self.format() if self.file: print("\r\033[K", end="", file=self.file) print(s, end="", file=self.file) def format(self) -> str: """Format and return the progress bar. The bar is is formatted according to :attr:`FMT`, using variables from :attr:`kwargs` and two computed variables: - ``bar``: visualization of the progress bar, empty when ``total`` is 0 - ``counter``: display of ``count``, ``total``, and ``units`` >>> p = ProgressBar(message="Progress", count=2, total=10, unit="K") >>> p.format() '\\rProgress [###### ] (2/10K) ' >>> p = ProgressBar(count=2, status="Counting...") >>> p.format() '\\r (2) Counting...' """ _kw = self.kwargs width = 30 total: int = _kw["total"] # type: ignore count: int = _kw["count"] # type: ignore if total > 0: num = min(count, total) * width fill = (num // total) * "#" part = ((num % total) * 3) // total if part: fill += "-="[part - 1] bar = f" [{fill:<{width}}]" counter = f" ({count}/{total}{_kw['unit']}) " else: bar = "" counter = f" ({count}{_kw['unit']}) " return self.FMT.format(bar=bar, counter=counter, **_kw) def flash(self, message: str) -> None: """Overwrite the progress bar with *message*.""" print(f"\r\033[K{message}", end="", file=self.file) def close(self) -> None: """Print a newline so the last printed bar remains on screen.""" print(file=self.file) wn-1.0.0/wn/validate.py000066400000000000000000000300261513755206300147330ustar00rootroot00000000000000"""Wordnet lexicon validation. This module is for checking whether the the contents of a lexicon are valid according to a series of checks. 
Those checks are: ==== ========================================================== Code Message ==== ========================================================== E101 ID is not unique within the lexicon. W201 Lexical entry has no senses. W202 Redundant sense between lexical entry and synset. W203 Redundant lexical entry with the same lemma and synset. E204 Synset of sense is missing. W301 Synset is empty (not associated with any lexical entries). W302 ILI is repeated across synsets. W303 Proposed ILI is missing a definition. W304 Existing ILI has a spurious definition. W305 Synset has a blank definition. W306 Synset has a blank example. W307 Synset repeats an existing definition. E401 Relation target is missing or invalid. W402 Relation type is invalid for the source and target. W403 Redundant relation between source and target. W404 Reverse relation is missing. W501 Synset's part-of-speech is different from its hypernym's. W502 Relation is a self-loop. ==== ========================================================== """ from collections import Counter from collections.abc import ( Callable, Iterator, Sequence, ) from itertools import chain from typing import TypedDict, cast from wn import lmf from wn.constants import ( REVERSE_RELATIONS, SENSE_RELATIONS, SENSE_SYNSET_RELATIONS, SYNSET_RELATIONS, ) from wn.util import ProgressBar, ProgressHandler _Ids = dict[str, Counter] _Result = dict[str, dict] _CheckFunction = Callable[[lmf.Lexicon, _Ids], _Result] class _Check(TypedDict): message: str items: _Result _Report = dict[str, _Check] def _non_unique_id(lex: lmf.Lexicon, ids: _Ids) -> _Result: """ID is not unique within the lexicon""" return _multiples( chain( [lex["id"]], (f["id"] for e in _entries(lex) for f in _forms(e) if f.get("id")), (sb["id"] for sb in lex.get("frames", []) if sb.get("id")), ids["entry"].elements(), ids["sense"].elements(), ids["synset"].elements(), ) ) def _has_no_senses(lex: lmf.Lexicon, ids: _Ids) -> _Result: """lexical entry has no senses""" 
return {e["id"]: {} for e in _entries(lex) if not _senses(e)} def _redundant_sense(lex: lmf.Lexicon, ids: _Ids) -> _Result: """redundant sense between lexical entry and synset""" result: _Result = {} for e in _entries(lex): redundant = _multiples(s["synset"] for s in _senses(e)) result.update( (s["id"], {"entry": e["id"], "synset": s["synset"]}) for s in _senses(e) if s["synset"] in redundant ) return result def _redundant_entry(lex: lmf.Lexicon, ids: _Ids) -> _Result: """redundant lexical entry with the same lemma and synset""" redundant = _multiples( (e["lemma"]["writtenForm"], s["synset"]) for e in _entries(lex) for s in _senses(e) ) return {form: {"synset": synset} for form, synset in redundant} def _missing_synset(lex: lmf.Lexicon, ids: _Ids) -> _Result: """synset of sense is missing""" synset_ids = ids["synset"] return { s["id"]: {"synset": s["synset"]} for e in _entries(lex) for s in _senses(e) if s["synset"] not in synset_ids } def _empty_synset(lex: lmf.Lexicon, ids: _Ids) -> _Result: """synset is empty (not associated with any lexical entries)""" synsets = {s["synset"] for e in _entries(lex) for s in _senses(e)} return {ss["id"]: {} for ss in _synsets(lex) if ss["id"] not in synsets} def _repeated_ili(lex: lmf.Lexicon, ids: _Ids) -> _Result: """ILI is repeated across synsets""" repeated = _multiples( ss["ili"] for ss in _synsets(lex) if ss["ili"] and ss["ili"] != "in" ) return { ss["id"]: {"ili": ss["ili"]} for ss in _synsets(lex) if ss["ili"] in repeated } def _missing_ili_definition(lex: lmf.Lexicon, ids: _Ids) -> _Result: """proposed ILI is missing a definition""" return { ss["id"]: {} for ss in _synsets(lex) if ss["ili"] == "in" and not ss.get("ili_definition") } def _spurious_ili_definition(lex: lmf.Lexicon, ids: _Ids) -> _Result: """existing ILI has a spurious definition""" return { ss["id"]: {"ili_definitin": ss["ili_definition"]} for ss in _synsets(lex) if ss["ili"] and ss["ili"] != "in" and ss.get("ili_definition") } def 
_blank_synset_definition(lex: lmf.Lexicon, ids: _Ids) -> _Result: """synset has a blank definition""" return { ss["id"]: {} for ss in _synsets(lex) if any(dfn["text"].strip() == "" for dfn in ss.get("definitions", [])) } def _blank_synset_example(lex: lmf.Lexicon, ids: _Ids) -> _Result: """synset has a blank example""" return { ss["id"]: {} for ss in _synsets(lex) if any(ex["text"].strip() == "" for ex in ss.get("examples", [])) } def _repeated_synset_definition(lex: lmf.Lexicon, ids: _Ids) -> _Result: """synset repeats an existing definition""" repeated = _multiples( dfn["text"] for ss in _synsets(lex) for dfn in ss.get("definitions", []) ) return { ss["id"]: {} for ss in _synsets(lex) if any(dfn["text"] in repeated for dfn in ss.get("definitions", [])) } def _missing_relation_target(lex: lmf.Lexicon, ids: _Ids) -> _Result: """relation target is missing or invalid""" result = { s["id"]: {"type": r["relType"], "target": r["target"]} for s, r in _sense_relations(lex) if r["target"] not in ids["sense"] and r["target"] not in ids["synset"] } result.update( (ss["id"], {"type": r["relType"], "target": r["target"]}) for ss, r in _synset_relations(lex) if r["target"] not in ids["synset"] ) return result def _invalid_relation_type(lex: lmf.Lexicon, ids: _Ids) -> _Result: """relation type is invalid for the source and target""" result = { s["id"]: {"type": r["relType"], "target": r["target"]} for s, r in _sense_relations(lex) if (r["target"] in ids["sense"] and r["relType"] not in SENSE_RELATIONS) or (r["target"] in ids["synset"] and r["relType"] not in SENSE_SYNSET_RELATIONS) } result.update( (ss["id"], {"type": r["relType"], "target": r["target"]}) for ss, r in _synset_relations(lex) if r["relType"] not in SYNSET_RELATIONS ) return result def _redundant_relation(lex: lmf.Lexicon, ids: _Ids) -> _Result: """redundant relation between source and target""" redundant = _multiples( chain( ( (s["id"], r["relType"], r["target"], _get_dc_type(r)) for s, r in _sense_relations(lex) 
), ( (ss["id"], r["relType"], r["target"], _get_dc_type(r)) for ss, r in _synset_relations(lex) ), ) ) return { src: ({"type": typ, "target": tgt} | ({"dc:type": dctyp} if dctyp else {})) for src, typ, tgt, dctyp in redundant } def _missing_reverse_relation(lex: lmf.Lexicon, ids: _Ids) -> _Result: """reverse relation is missing""" regular = { (s["id"], r["relType"], r["target"]) for s, r in _sense_relations(lex) if r["target"] in ids["sense"] } regular.update( (ss["id"], r["relType"], r["target"]) for ss, r in _synset_relations(lex) ) return { tgt: {"type": REVERSE_RELATIONS[typ], "target": src} for src, typ, tgt in regular if typ in REVERSE_RELATIONS and (tgt, REVERSE_RELATIONS[typ], src) not in regular } def _hypernym_wrong_pos(lex: lmf.Lexicon, ids: _Ids) -> _Result: """synset's part-of-speech is different from its hypernym's""" sspos = {ss["id"]: ss.get("partOfSpeech") for ss in _synsets(lex)} return { ss["id"]: {"type": r["relType"], "target": r["target"]} for ss, r in _synset_relations(lex) if r["relType"] == "hypernym" and ss.get("partOfSpeech") != sspos[r["target"]] } def _self_loop(lex: lmf.Lexicon, ids: _Ids) -> _Result: """relation is a self-loop""" relations = chain(_sense_relations(lex), _synset_relations(lex)) return { x["id"]: {"type": r["relType"], "target": r["target"]} for x, r in relations if x["id"] == r["target"] } # Helpers def _multiples(iterable): counts = Counter(iterable) return {x: {"count": cnt} for x, cnt in counts.items() if cnt > 1} def _entries(lex: lmf.Lexicon) -> list[lmf.LexicalEntry]: return lex.get("entries", []) def _forms(e: lmf.LexicalEntry) -> list[lmf.Form]: return e.get("forms", []) def _senses(e: lmf.LexicalEntry) -> list[lmf.Sense]: return e.get("senses", []) def _synsets(lex: lmf.Lexicon) -> list[lmf.Synset]: return lex.get("synsets", []) def _sense_relations(lex: lmf.Lexicon) -> Iterator[tuple[lmf.Sense, lmf.Relation]]: for e in _entries(lex): for s in _senses(e): for r in s.get("relations", []): yield (s, r) def 
_synset_relations(lex: lmf.Lexicon) -> Iterator[tuple[lmf.Synset, lmf.Relation]]: for ss in _synsets(lex): for r in ss.get("relations", []): yield (ss, r) def _get_dc_type(r: lmf.Relation) -> str | None: return (r.get("meta") or {}).get("type") # Check codes and messages # # categories: # E - errors # W - warnings # subcategories: # 100 - general # 200 - words and senses # 300 - synsets and ilis # 400 - relations # 500 - graph and taxonomy _codes: dict[str, _CheckFunction] = { # 100 - general "E101": _non_unique_id, # 200 - words and senses "W201": _has_no_senses, "W202": _redundant_sense, "W203": _redundant_entry, "E204": _missing_synset, # 300 - synsets and ilis "W301": _empty_synset, "W302": _repeated_ili, "W303": _missing_ili_definition, "W304": _spurious_ili_definition, "W305": _blank_synset_definition, "W306": _blank_synset_example, "W307": _repeated_synset_definition, # 400 - relations "E401": _missing_relation_target, "W402": _invalid_relation_type, "W403": _redundant_relation, "W404": _missing_reverse_relation, # 500 - graph "W501": _hypernym_wrong_pos, "W502": _self_loop, } def _select_checks(select: Sequence[str]) -> list[tuple[str, _CheckFunction, str]]: selectset = set(select) return [ (code, func, func.__doc__ or "") for code, func in _codes.items() if code in selectset or code[0] in selectset ] # Main function def validate( lex: lmf.Lexicon | lmf.LexiconExtension, select: Sequence[str] = ("E", "W"), progress_handler: type[ProgressHandler] | None = ProgressBar, ) -> _Report: """Check *lex* for validity and return a report of the results. The *select* argument is a sequence of check codes (e.g., ``E101``) or categories (``E`` or ``W``). The *progress_handler* parameter takes a subclass of :class:`wn.util.ProgressHandler`. An instance of the class will be created, used, and closed by this function. 
""" if lex.get("extends"): print("validation of lexicon extensions is not supported") return {} lex = cast("lmf.Lexicon", lex) if progress_handler is None: progress_handler = ProgressHandler ids: _Ids = { "entry": Counter(entry["id"] for entry in _entries(lex)), "sense": Counter( sense["id"] for entry in _entries(lex) for sense in _senses(entry) ), "synset": Counter(synset["id"] for synset in _synsets(lex)), } checks = _select_checks(select) progress = progress_handler(message="Validate", total=len(checks)) report: _Report = {} for code, func, message in checks: progress.set( status=getattr(func, "__name__", "(unknown test)").replace("_", " ") ) report[code] = _Check(message=message, items=func(lex, ids)) progress.update() progress.set(status="") progress.close() return report