pax_global_header00006660000000000000000000000064151472313020014510gustar00rootroot0000000000000052 comment=2a0c6bf2a3538ff0233df51ba76b3bfc35623937 hickeroar-simplebayes-4803e70/000077500000000000000000000000001514723130200162465ustar00rootroot00000000000000hickeroar-simplebayes-4803e70/.flake8000066400000000000000000000002411514723130200174160ustar00rootroot00000000000000[flake8] max-line-length = 120 # Test aggregator uses star imports to register test classes for pytest discovery per-file-ignores = tests/test.py: F401,F403 hickeroar-simplebayes-4803e70/.github/000077500000000000000000000000001514723130200176065ustar00rootroot00000000000000hickeroar-simplebayes-4803e70/.github/workflows/000077500000000000000000000000001514723130200216435ustar00rootroot00000000000000hickeroar-simplebayes-4803e70/.github/workflows/test.yml000066400000000000000000000035401514723130200233470ustar00rootroot00000000000000name: Tests and Quality on: push: branches: [master, main] pull_request: branches: [master, main] workflow_dispatch: schedule: - cron: "0 4 * * *" jobs: test-and-coverage: runs-on: ubuntu-latest strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install -e . pip install -r setup/requirements.dev.txt - name: Run tests with strict coverage run: pytest tests/ --cov=simplebayes --cov-fail-under=100 -v - name: Run flake8 run: flake8 simplebayes tests - name: Run pylint run: pylint simplebayes tests --fail-under=10 api-integration: runs-on: ubuntu-latest needs: test-and-coverage steps: - uses: actions/checkout@v4 - name: Set up Python 3.13 uses: actions/setup-python@v5 with: python-version: "3.13" - name: Install dependencies run: | python -m pip install --upgrade pip pip install -e . 
pip install -r setup/requirements.dev.txt - name: Run API integration slice run: pytest tests/test_api_endpoints.py tests/test_cli_integration.py -v packaging-smoke: runs-on: ubuntu-latest needs: test-and-coverage steps: - uses: actions/checkout@v4 - name: Set up Python 3.13 uses: actions/setup-python@v5 with: python-version: "3.13" - name: Build package artifacts run: | python -m pip install --upgrade pip pip install build python -m build hickeroar-simplebayes-4803e70/.github/workflows/workflow.yml000066400000000000000000000011161514723130200242370ustar00rootroot00000000000000name: Publish to PyPI on: release: types: [published] push: tags: - "v*" workflow_dispatch: jobs: pypi-publish: runs-on: ubuntu-latest environment: release permissions: id-token: write steps: - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 with: python-version: "3.13" - name: Install build run: pip install build - name: Build package run: python -m build - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 hickeroar-simplebayes-4803e70/.gitignore000066400000000000000000000012741514723130200202420ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .cache nosetests.xml coverage.xml # Translations *.mo *.pot # Django stuff: *.log # Sphinx documentation docs/_build/ # PyBuilder target/ .idea .vagrant MANIFESThickeroar-simplebayes-4803e70/.pre-commit-config.yaml000066400000000000000000000007501514723130200225310ustar00rootroot00000000000000repos: - repo: local hooks: - id: pytest name: pytest entry: pytest tests/ --cov=simplebayes --cov-fail-under=100 -q language: system pass_filenames: false - id: flake8 name: flake8 entry: flake8 simplebayes tests language: system pass_filenames: false - id: pylint name: pylint entry: pylint simplebayes tests --fail-under=10 language: system pass_filenames: false hickeroar-simplebayes-4803e70/CHANGELOG.md000066400000000000000000000140141514723130200200570ustar00rootroot00000000000000# simplebayes Changelog All notable changes to this project are documented here. ## v3.2.0 ### Added - CLI options for `simplebayes-server`: - `--language` – Language code for stemmer and stop words (default: `english`). Environment: `SIMPLEBAYES_LANGUAGE`. - `--remove-stop-words` – Filter common stop words. Environment: `SIMPLEBAYES_REMOVE_STOP_WORDS` (`1`, `true`, `yes` = enabled). - `--verbose` – Log requests, responses, and classifier operations to stderr. Environment: `SIMPLEBAYES_VERBOSE` (`1`, `true`, `yes` = enabled). - Verbose mode: request/response middleware logs method, path, Content-Length, status code, and body preview (truncated at 500 chars) to stderr with `[simplebayes]` prefix. - Verbose mode: classifier insight for each endpoint – tokens extracted, category operations, scores, and summaries (token lists truncated at 20 items). - `create_app(language, remove_stop_words, verbose)` – API app factory now accepts classifier and logging options. - `--help` documents all CLI arguments. 
### Changed - API classifier is now configured from CLI `--language` and `--remove-stop-words` instead of using fixed defaults. - README: CLI options table, environment variable equivalents, and Verbose mode subsection. ## v3.1.1 ### Changed - Documentation and metadata: updated terminology from "Bayes" to "Bayesian" in PyPI keywords and changelog for consistency with proper nomenclature. ## v3.1.0 ### Added - Classifier options for `SimpleBayes`: - `alpha` – Laplace smoothing. Use `0.01` or `1.0` to avoid zero probabilities for tokens unseen in a category; improves handling of sparse vocabularies. Default `0` preserves prior behavior. - `language` – Language code for stemmer and stop words (e.g. `"english"`, `"spanish"`, `"french"`). All Snowball languages supported. Default `"english"`. - `remove_stop_words` – Filter common stop words when `True`. Default `False` for backwards compatibility. - Built-in stopword lists for every Snowball language (Arabic, Armenian, Basque, Catalan, Danish, Dutch, English, Esperanto, Estonian, Finnish, French, German, Greek, Hindi, Hungarian, Indonesian, Irish, Italian, Lithuanian, Nepali, Norwegian, Portuguese, Romanian, Russian, Serbian, Spanish, Swedish, Tamil, Turkish, Yiddish). No download or file storage required. - `create_tokenizer(language, remove_stop_words)` – factory for language-aware tokenizers. - API: Bearer auth integrated with OpenAPI docs. `/docs` and `/redoc` expose the Bearer scheme; use the "Authorize" button in Swagger UI for interactive testing. - `UnauthorizedError` – domain exception for Bearer auth failures; produces 401 with `WWW-Authenticate` header. ### Changed - Tokenizer pipeline: `language` drives both stemming and stop-word filtering. When `remove_stop_words=True`, stop words are filtered after stemming. - Laplace smoothing applied in probability calculations when `alpha > 0`. - API routes refactored to use FastAPI `Depends` for auth, classifier, and readiness state. 
- README: Classifier Options table, Tokenization section, Bearer auth docs. - Pylint: `--fail-under=10` in CONTRIBUTING and README. ## v3.0.0 ### Breaking - Introduced a full HTTP API runtime with CLI entrypoint and expanded package layout. - Added typed classification/result contracts and stricter category validation semantics. - Added versioned JSON model persistence APIs (`save`/`load`, `save_to_file`/`load_from_file`) with validation and atomic file writes. - Removed legacy pickle persistence APIs (`cache_train`, `cache_persist`, `get_cache_location`) in favor of JSON-only persistence. - Removed legacy `SimpleBayes` constructor cache arguments (`cache_path`, `cache_file`). - `/classify` now returns `category: null` when no category can be selected. ### Added - FastAPI API surface: - `/info` - `/train/{category}` - `/untrain/{category}` - `/classify` - `/score` - `/flush` - `/healthz` - `/readyz` - Optional bearer token protection for all non-probe endpoints. - 1 MiB request body guardrails for text endpoints. - Readiness lifecycle state with drain behavior on shutdown. - CLI runner (`simplebayes-server`) with host/port/auth-token options and env var support. - Dedicated tokenizer pipeline with Unicode normalization, lowercasing, non-word splitting, and English stemming. - Thread-safe classifier state handling and concurrency stress tests. - New docs set: - rewritten README with API and operational guidance - CONTRIBUTING guide ### Changed - CI now includes expanded quality lanes: - strict 100% coverage gate - API integration slice - packaging/build smoke lane - scheduled/manual workflow triggers ## v2.1.0 ### Added - Release bump to 2.1.0. ### Changed - Cache path handling and docs cleanup improvements. - Repository/license housekeeping updates. ## v2.0.0 ### Added - Modernized project/tooling baseline for current Python versions. - Added PyPI publish workflow. - Added `.flake8` handling updates for test/lint compatibility. 
### Changed - Refreshed release and packaging workflow. ## v1.5.8 ### Changed - No code delta from v1.5.7 (tag alignment release marker). ## v1.5.7 ### Added - Expanded docs generation and API documentation links. ### Fixed - Critical scoring behavior after cache reload. - Miscellaneous documentation and docstring fixes. ### Changed - README and docs hosting updates. ## v1.5.5 ### Changed - Updated licensing metadata and README license coverage. ## v1.5.4 ### Changed - Reverted project URL configuration update. ## v1.5.3 ### Changed - Renamed internal function(s) for readability. - Version metadata bump. ## v1.5.2 ### Added - Added distribution script and moved packaging flow to setuptools. ## v1.5.1 ### Added - Initial classifier core with train/untrain/score/classify behavior. - Basic packaging and setup metadata. - Persistence support and accompanying unit tests. - Full test coverage wiring and early build checks. ### Changed - Bayesian scoring approach rewrite and performance optimizations. - Python 2/3 compatibility updates during early lifecycle. - Documentation, README, and homepage iterations. hickeroar-simplebayes-4803e70/CONTRIBUTING.md000066400000000000000000000015141514723130200205000ustar00rootroot00000000000000# Contributing ## Local development checks Run these before opening a PR: ```sh ./.venv/bin/pytest tests/ --cov=simplebayes --cov-fail-under=100 -v ./.venv/bin/flake8 simplebayes tests ./.venv/bin/pylint simplebayes tests --fail-under=10 ``` Optional but recommended: ```sh ./.venv/bin/pytest tests/test_api_endpoints.py -v ./.venv/bin/pytest tests/test_concurrency.py -v ``` ## CI parity CI should run: - full tests with 100% coverage gate - lint checks - packaging/build validation When API behavior changes, include endpoint contract tests for status codes and payloads. ## Release and versioning - Use semantic version tags (for example, `v2.2.0`). - Keep backward compatibility unless intentionally releasing a breaking change. 
- If breaking API or contract changes are introduced, bump the major version before release tagging. hickeroar-simplebayes-4803e70/LICENSE000066400000000000000000000020671514723130200172600ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2026 Ryan Vennell Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. hickeroar-simplebayes-4803e70/MANIFEST.in000066400000000000000000000000411514723130200177770ustar00rootroot00000000000000include README.md include LICENSEhickeroar-simplebayes-4803e70/README.md000066400000000000000000000201441514723130200175260ustar00rootroot00000000000000# simplebayes A memory-based, optional-persistence naive Bayesian text classification package and web API for Python. --- ## Why? ``` Bayesian text classification is useful for things like spam detection, sentiment determination, and general category routing. You gather representative samples for each category, train the model, then classify new text based on learned token patterns. 
Once the model is trained, you can: - classify input into a best-fit category - inspect relative per-category scores - persist and reload model state ``` ## Installation Requires Python 3.10 or newer. ``` $ git clone https://github.com/hickeroar/simplebayes.git $ cd simplebayes $ python3 -m venv .venv $ source .venv/bin/activate $ pip install -e . ``` If you only want to use simplebayes as a library: ``` $ pip install simplebayes ``` --- ## Run as an API Server ``` $ simplebayes-server --port 8000 ``` CLI options: ``` --host Host interface to bind. (default: 0.0.0.0) --port Port to bind. (default: 8000) --auth-token Optional bearer token for non-probe endpoints. --language Language code for stemmer and stop words. (default: english) --remove-stop-words Filter common stop words (the, is, and, etc.). --verbose Log requests, responses, and classifier operations to stderr. --help Show all options. ``` Environment variable equivalents: ``` SIMPLEBAYES_HOST SIMPLEBAYES_PORT SIMPLEBAYES_AUTH_TOKEN SIMPLEBAYES_LANGUAGE SIMPLEBAYES_REMOVE_STOP_WORDS (1, true, yes = enabled) SIMPLEBAYES_VERBOSE (1, true, yes = enabled) ``` ### Verbose mode When `--verbose` is set, the server logs each request and response to stderr, plus classifier insight: tokens extracted, category operations, scores, and summaries. Example: ``` $ simplebayes-server --port 8000 --verbose ``` When `--auth-token` is configured, all API endpoints except `/healthz` and `/readyz` require: ``` Authorization: Bearer ``` The API uses HTTP Bearer authentication. When auth is enabled, OpenAPI docs at `/docs` and `/redoc` expose the Bearer scheme; use the "Authorize" button in Swagger UI to set the token for interactive testing. 
## Use as a Library in Your App Import and create a classifier: ```python from simplebayes import SimpleBayes classifier = SimpleBayes() # Optional: SimpleBayes(alpha=0.01, language="english", remove_stop_words=True) to filter stop words classifier.train("spam", "buy now limited offer click here") classifier.train("ham", "team meeting schedule for tomorrow") classification = classifier.classify_result("limited offer today") print(f"category={classification.category} score={classification.score}") scores = classifier.score("team schedule update") print(scores) classifier.untrain("spam", "buy now limited offer click here") ``` Persistence example: ```python from simplebayes import SimpleBayes classifier = SimpleBayes() classifier.train("spam", "buy now limited offer click here") classifier.save_to_file("/tmp/simplebayes-model.json") loaded = SimpleBayes() loaded.load_from_file("/tmp/simplebayes-model.json") print(loaded.classify_result("limited offer today")) ``` Custom options example: ```python # Laplace smoothing for better handling of unseen tokens classifier = SimpleBayes(alpha=0.01) # Spanish text with Spanish stemmer and stop words classifier = SimpleBayes(language="spanish", remove_stop_words=True) # Opt-in stop-word removal classifier = SimpleBayes(remove_stop_words=True) ``` Notes for library usage: - Classifier operations are thread-safe. - Scores are relative values; compare scores within the same model. - Category names accepted by `train`/`untrain` match `^[-_A-Za-z0-9]{1,64}$`. ### Classifier Options | Parameter | Default | Description | | --- | --- | --- | | `tokenizer` | built-in | Override with a callable `(str) -> list[str]`. | | `alpha` | `0.0` | Laplace smoothing. Use `0.01` or `1.0` to avoid zero probabilities for tokens unseen in a category; improves handling of sparse vocabularies. | | `language` | `"english"` | Language code for both the Snowball stemmer and built-in stop words. 
Supported: `arabic`, `armenian`, `basque`, `catalan`, `danish`, `dutch`, `english`, `esperanto`, `estonian`, `finnish`, `french`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `lithuanian`, `nepali`, `norwegian`, `portuguese`, `romanian`, `russian`, `serbian`, `spanish`, `swedish`, `tamil`, `turkish`, `yiddish`. | | `remove_stop_words` | `False` | Filter common stop words when `True` (the, is, and, etc.). Default `False` for backwards compatibility. | ### Tokenization Default tokenization (when no custom `tokenizer` is provided): 1. Unicode NFKC normalization and lowercasing 2. Split on non-word characters 3. Snowball stemming (language from `language` param) 4. Stop-word removal when `remove_stop_words=True` The `language` parameter drives both stemming and stop-word filtering. Built-in stopword lists are included for all supported languages: arabic, armenian, basque, catalan, danish, dutch, english, esperanto, estonian, finnish, french, german, greek, hindi, hungarian, indonesian, irish, italian, lithuanian, nepali, norwegian, portuguese, romanian, russian, serbian, spanish, swedish, tamil, turkish, yiddish. No download or file storage required. Stream APIs are available: - `save(stream)` - `load(stream)` File API notes: - `save_to_file("")` and `load_from_file("")` use `/tmp/simplebayes-model.json`. - Provided file paths must be absolute. ## Development Checks ``` $ ./.venv/bin/pytest tests/ --cov=simplebayes --cov-fail-under=100 -v $ ./.venv/bin/flake8 simplebayes tests $ ./.venv/bin/pylint simplebayes tests --fail-under=10 ``` --- ## Using the HTTP API ### API Notes - Category names in `/train/{category}` and `/untrain/{category}` must match `^[-_A-Za-z0-9]{1,64}$`. - Request body size is capped at 1 MiB on text endpoints. 
- Error responses for auth/size/encoding are JSON: - `{"error":"unauthorized"}` - `{"error":"request body too large"}` - `{"error":"invalid utf-8 payload"}` - The HTTP service stores classifier state in memory; process restarts clear training data. ### Common Error Responses | Status | When | | --- | --- | | `401` | Missing/invalid bearer token when auth is enabled | | `405` | Wrong HTTP method | | `400` | Request body contains invalid UTF-8 | | `413` | Request body exceeds 1 MiB | | `422` | Invalid category route format | ### Training the Classifier ##### Endpoint: ``` /train/{category} Example: /train/spam Accepts: POST Body: raw text/plain ``` Example: ```bash curl -s -X POST "http://localhost:8000/train/spam" \ -H "Content-Type: text/plain" \ --data "buy now limited offer click here" ``` ### Untraining the Classifier ##### Endpoint: ``` /untrain/{category} Example: /untrain/spam Accepts: POST Body: raw text/plain ``` ### Getting Classifier Status ##### Endpoint: ``` /info Accepts: GET ``` Example response: ```json { "categories": { "spam": { "tokenTally": 6, "probNotInCat": 0, "probInCat": 1 } } } ``` ### Classifying Text ##### Endpoint: ``` /classify Accepts: POST Body: raw text/plain ``` Example response: ```json { "category": "spam", "score": 3.2142857142857144 } ``` If no category can be selected (for example, untrained model), `category` is returned as `null`. ### Scoring Text ##### Endpoint: ``` /score Accepts: POST Body: raw text/plain ``` Example response: ```json { "spam": 3.2142857142857144, "ham": 0.7857142857142857 } ``` ### Flushing Training Data ##### Endpoint: ``` /flush Accepts: POST Body: raw text/plain (optional) ``` Example response: ```json { "success": true, "categories": {} } ``` ### Health and Readiness ##### Liveness endpoint ``` /healthz Accepts: GET ``` ##### Readiness endpoint ``` /readyz Accepts: GET ``` `/healthz` and `/readyz` are intentionally unauthenticated even when API auth is enabled. 
## Operational Notes - The HTTP server is in-memory by default; deploys/restarts wipe trained state. - Use `save_to_file` and `load_from_file` in library workflows to persist/reload model state. - `/readyz` returns `200` while accepting traffic and `503` when draining during shutdown. ## License MIT, see `LICENSE`. hickeroar-simplebayes-4803e70/docs/000077500000000000000000000000001514723130200171765ustar00rootroot00000000000000hickeroar-simplebayes-4803e70/docs/Makefile000066400000000000000000000164051514723130200206440ustar00rootroot00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = _build # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" @echo " coverage to run coverage check of the documentation (if enabled)" clean: rm -rf $(BUILDDIR)/* html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/simplebayes.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/simplebayes.qhc" applehelp: $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp @echo @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." @echo "N.B. You won't be able to view it unless you put it in" \ "~/Library/Documentation/Help or install it in your application" \ "bundle." devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/simplebayes" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/simplebayes" @echo "# devhelp" epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." 
$(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." latexpdfja: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." coverage: $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage @echo "Testing of coverage in the sources finished, look at the " \ "results in $(BUILDDIR)/coverage/python.txt." 
xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." hickeroar-simplebayes-4803e70/docs/conf.py000066400000000000000000000026521514723130200205020ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- import os import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.todo', 'sphinx.ext.viewcode', ] templates_path = ['_templates'] source_suffix = '.rst' master_doc = 'index' project = 'simplebayes' copyright = '2026, Ryan Vennell' author = 'Ryan Vennell' try: import simplebayes version = getattr(simplebayes, '__version__', '3.2.0') except ImportError: version = '3.2.0' release = version language = 'en' exclude_patterns = ['_build'] add_function_parentheses = True add_module_names = True show_authors = False pygments_style = 'sphinx' todo_include_todos = True html_theme = 'sphinx_rtd_theme' html_static_path = ['_static'] htmlhelp_basename = 'simplebayesdoc' latex_elements = {} latex_documents = [ ( master_doc, 'simplebayes.tex', 'simplebayes Documentation', author, 'manual', ), ] man_pages = [ (master_doc, 'simplebayes', 'simplebayes Documentation', [author], 1), ] texinfo_documents = [ ( master_doc, 'simplebayes', 'simplebayes Documentation', author, 'simplebayes', 'One line description of project.', 'Miscellaneous', ), ] epub_title = project epub_author = author epub_publisher = author epub_copyright = copyright epub_exclude_files = ['search.html'] hickeroar-simplebayes-4803e70/docs/index.rst000066400000000000000000000007051514723130200210410ustar00rootroot00000000000000.. simplebayes documentation master file, created by sphinx-quickstart on Wed Apr 8 21:11:57 2015. 
You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. Welcome to simplebayes's documentation! ======================================= Contents: .. toctree:: :maxdepth: 4 simplebayes Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` hickeroar-simplebayes-4803e70/docs/make.bat000066400000000000000000000161261514723130200206110ustar00rootroot00000000000000@ECHO OFF REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set BUILDDIR=_build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . set I18NSPHINXOPTS=%SPHINXOPTS% . if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^` where ^ is one of echo. html to make standalone HTML files echo. dirhtml to make HTML files named index.html in directories echo. singlehtml to make a single large HTML file echo. pickle to make pickle files echo. json to make JSON files echo. htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. devhelp to make HTML files and a Devhelp project echo. epub to make an epub echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. text to make text files echo. man to make manual pages echo. texinfo to make Texinfo files echo. gettext to make PO message catalogs echo. changes to make an overview over all changed/added/deprecated items echo. xml to make Docutils-native XML files echo. pseudoxml to make pseudoxml-XML files for display purposes echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled echo. 
coverage to run coverage check of the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) REM Check if sphinx-build is available and fallback to Python version if any %SPHINXBUILD% 2> nul if errorlevel 9009 goto sphinx_python goto sphinx_ok :sphinx_python set SPHINXBUILD=python -m sphinx.__init__ %SPHINXBUILD% 2> nul if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) :sphinx_ok if "%1" == "html" ( %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "singlehtml" ( %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the JSON files. goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. 
goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\simplebayes.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\simplebayes.ghc goto end ) if "%1" == "devhelp" ( %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp if errorlevel 1 exit /b 1 echo. echo.Build finished. goto end ) if "%1" == "epub" ( %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub if errorlevel 1 exit /b 1 echo. echo.Build finished. The epub file is in %BUILDDIR%/epub. goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex if errorlevel 1 exit /b 1 echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdf" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdfja" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf-ja cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "text" ( %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text if errorlevel 1 exit /b 1 echo. echo.Build finished. The text files are in %BUILDDIR%/text. goto end ) if "%1" == "man" ( %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man if errorlevel 1 exit /b 1 echo. echo.Build finished. The manual pages are in %BUILDDIR%/man. goto end ) if "%1" == "texinfo" ( %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo if errorlevel 1 exit /b 1 echo. echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. goto end ) if "%1" == "gettext" ( %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale if errorlevel 1 exit /b 1 echo. 
echo.Build finished. The message catalogs are in %BUILDDIR%/locale. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes if errorlevel 1 exit /b 1 echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck if errorlevel 1 exit /b 1 echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest if errorlevel 1 exit /b 1 echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) if "%1" == "coverage" ( %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage if errorlevel 1 exit /b 1 echo. echo.Testing of coverage in the sources finished, look at the ^ results in %BUILDDIR%/coverage/python.txt. goto end ) if "%1" == "xml" ( %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml if errorlevel 1 exit /b 1 echo. echo.Build finished. The XML files are in %BUILDDIR%/xml. goto end ) if "%1" == "pseudoxml" ( %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml if errorlevel 1 exit /b 1 echo. echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. goto end ) :end hickeroar-simplebayes-4803e70/docs/simplebayes.categories.rst000066400000000000000000000002351514723130200243710ustar00rootroot00000000000000simplebayes.categories module ============================= .. automodule:: simplebayes.categories :members: :undoc-members: :show-inheritance: hickeroar-simplebayes-4803e70/docs/simplebayes.category.rst000066400000000000000000000002271514723130200240620ustar00rootroot00000000000000simplebayes.category module =========================== .. 
automodule:: simplebayes.category :members: :undoc-members: :show-inheritance: hickeroar-simplebayes-4803e70/docs/simplebayes.rst000066400000000000000000000003671514723130200222530ustar00rootroot00000000000000simplebayes package =================== Submodules ---------- .. toctree:: simplebayes.categories simplebayes.category Module contents --------------- .. automodule:: simplebayes :members: :undoc-members: :show-inheritance: hickeroar-simplebayes-4803e70/pylintrc000066400000000000000000000006611514723130200200400ustar00rootroot00000000000000[MASTER] ignore=CVS persistent=no [MESSAGES CONTROL] disable=locally-disabled,missing-module-docstring,missing-function-docstring,too-many-public-methods,wrong-import-order,no-member [FORMAT] max-line-length=120 ignore-long-lines=^\s*(# )??$ [VARIABLES] dummy-variables-rgx=^(_|dummy)$ [LOGGING] logging-modules=logging [MISCELLANEOUS] notes=FIXME,XXX,TODO [EXCEPTIONS] overgeneral-exceptions=builtins.Exception hickeroar-simplebayes-4803e70/pyproject.toml000066400000000000000000000030131514723130200211570ustar00rootroot00000000000000[build-system] requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [project] name = "simplebayes" version = "3.2.0" description = "A memory-based, optional-persistence naïve bayesian text classifier." 
readme = "README.md" license = "MIT" license-files = ["LICENSE"] requires-python = ">=3.10" authors = [ {name = "Ryan Vennell", email = "ryan.vennell@gmail.com"} ] keywords = ["bayesian", "classifier", "naive", "text", "spam"] dependencies = [ "fastapi>=0.116.1", "snowballstemmer>=3.0.1", "uvicorn[standard]>=0.35.0", ] classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Topic :: Utilities", ] [project.scripts] simplebayes-server = "simplebayes.cli:run" [project.urls] Homepage = "https://github.com/hickeroar/simplebayes" Repository = "https://github.com/hickeroar/simplebayes" Issues = "https://github.com/hickeroar/simplebayes/issues" [tool.pytest.ini_options] testpaths = ["tests"] addopts = "-v" python_files = ["test_*.py"] python_functions = ["test_*"] [tool.coverage.run] source = ["simplebayes"] branch = true [tool.coverage.report] fail_under = 100 show_missing = true [tool.setuptools.packages.find] include = ["simplebayes", "simplebayes.*"] [tool.setuptools.package-data] simplebayes = ["py.typed"] hickeroar-simplebayes-4803e70/setup.py000066400000000000000000000002011514723130200177510ustar00rootroot00000000000000from setuptools import setup # Keep setup.py as a compatibility shim. # Project metadata is defined in pyproject.toml. setup() hickeroar-simplebayes-4803e70/setup/000077500000000000000000000000001514723130200174065ustar00rootroot00000000000000hickeroar-simplebayes-4803e70/setup/distribute.sh000077500000000000000000000003271514723130200221250ustar00rootroot00000000000000#!/usr/bin/env bash # Build and upload to PyPI using twine (modern approach) # Requires: pip install build twine set -e echo "Building package..." python -m build echo "Uploading to PyPI..." 
twine upload dist/* hickeroar-simplebayes-4803e70/setup/requirements.dev.txt000066400000000000000000000000461514723130200234470ustar00rootroot00000000000000pytest pytest-cov httpx flake8 pylint hickeroar-simplebayes-4803e70/simplebayes/000077500000000000000000000000001514723130200205635ustar00rootroot00000000000000hickeroar-simplebayes-4803e70/simplebayes/__init__.py000066400000000000000000000342611514723130200227020ustar00rootroot00000000000000# coding: utf-8 __version__ = '3.2.0' import threading from collections import Counter from typing import Callable, Dict, List, Optional from simplebayes.categories import BayesCategories from simplebayes.constants import CATEGORY_PATTERN from simplebayes.errors import InvalidCategoryError from simplebayes.models import CategorySummary, ClassificationResult from simplebayes.persistence import ( PERSISTED_MODEL_VERSION, dump_model_state, load_model_state, load_model_state_from_file, save_model_state_to_file, validate_model_state, ) from simplebayes.tokenization import create_tokenizer, default_tokenize_text __all__ = ['SimpleBayes'] class SimpleBayes: """A memory-based, optional-persistence naïve bayesian text classifier.""" def __init__( self, tokenizer: Optional[Callable[[str], List[str]]] = None, alpha: float = 0.0, language: str = "english", remove_stop_words: bool = False, ) -> None: """ :param tokenizer: A tokenizer override. When None, uses built-in tokenizer. :param alpha: Laplace smoothing parameter. Use > 0 (e.g. 0.01 or 1.0) to avoid zero probabilities for tokens unseen in a category. Default 0 preserves prior behavior. :param language: Language code for stemmer and stop words (e.g. "english", "spanish"). Default "english". :param remove_stop_words: If True, filter stop words. Default False (backwards compatible). 
""" self.categories = BayesCategories() self.tokenizer = ( tokenizer or create_tokenizer(language=language, remove_stop_words=remove_stop_words) ) self.alpha = alpha self.probabilities = {} self._lock = threading.RLock() @classmethod def tokenize_text(cls, text: str) -> List[str]: """ Default tokenize method; can be overridden :param text: the text we want to tokenize :type text: str :return: list of tokenized text :rtype: list """ return default_tokenize_text(text) @classmethod def count_token_occurrences(cls, words: List[str]) -> Dict[str, int]: """ Creates a key/value set of word/count for a given sample of text :param words: full list of all tokens, non-unique :type words: list :return: key/value pairs of words and their counts in the list :rtype: dict """ return dict(Counter(words)) def flush(self) -> None: """ Deletes all tokens & categories """ with self._lock: self.categories = BayesCategories() self.probabilities = {} def calculate_category_probability(self) -> None: """ Caches the individual probabilities for each category """ with self._lock: total_tally = 0.0 probs = {} for category, bayes_category in \ self.categories.get_categories().items(): count = bayes_category.get_tally() total_tally += count probs[category] = count # Calculating the probability for category, count in probs.items(): if total_tally > 0: probs[category] = float(count)/float(total_tally) else: probs[category] = 0.0 new_probabilities = {} for category, probability in probs.items(): new_probabilities[category] = { # Probability that any given token is of this category 'prc': probability, # Probability that any given token is not of this category 'prnc': 1.0 - probability } self.probabilities = new_probabilities def train(self, category: str, text: str) -> None: """ Trains a category with a sample of text :param category: the name of the category we want to train :type category: str :param text: the text we want to train the category with :type text: str """ category = 
self.normalize_category(category) with self._lock: try: bayes_category = self.categories.get_category(category) except KeyError: bayes_category = self.categories.add_category(category) tokens = self.tokenizer(str(text)) occurrence_counts = self.count_token_occurrences(tokens) for word, count in occurrence_counts.items(): bayes_category.train_token(word, count) # Updating our per-category overall probabilities self.calculate_category_probability() def untrain(self, category: str, text: str) -> None: """ Untrains a category with a sample of text :param category: the name of the category we want to train :type category: str :param text: the text we want to untrain the category with :type text: str """ category = self.normalize_category(category) with self._lock: try: bayes_category = self.categories.get_category(category) except KeyError: return tokens = self.tokenizer(str(text)) occurrence_counts = self.count_token_occurrences(tokens) for word, count in occurrence_counts.items(): bayes_category.untrain_token(word, count) if bayes_category.get_tally() == 0: self.categories.delete_category(category) # Updating our per-category overall probabilities self.calculate_category_probability() def classify(self, text: str) -> Optional[str]: """ Chooses the highest scoring category for a sample of text :param text: sample text to classify :type text: str :return: the "winning" category :rtype: str """ with self._lock: score = self.score(text) highest_category, _ = self._find_highest_category(score) return highest_category def classify_result(self, text: str) -> ClassificationResult: """ Returns structured classification output including score. 
""" with self._lock: scores = self.score(text) highest_category, highest_score = self._find_highest_category(scores) return ClassificationResult(category=highest_category or None, score=highest_score) @classmethod def _find_highest_category(cls, scores: Dict[str, float]) -> tuple[Optional[str], float]: if not scores: return None, 0.0 highest_category = None highest_score = 0.0 for category in sorted(scores.keys()): category_score = float(scores[category]) if category_score > highest_score: highest_score = category_score highest_category = category return highest_category, highest_score def score(self, text: str) -> Dict[str, float]: """ Scores a sample of text :param text: sample text to score :type text: str :return: dict of scores per category :rtype: dict """ with self._lock: occurs = self.count_token_occurrences(self.tokenizer(text)) scores = {} for category in self.categories.get_categories(): scores[category] = 0 categories = self.categories.get_categories().items() for word, count in occurs.items(): token_scores = {} # Adding up individual token scores for category, bayes_category in categories: token_scores[category] = \ float(bayes_category.get_token_count(word)) # We use this to get token-in-category probabilities token_tally = sum(token_scores.values()) # If this token isn't found anywhere its probability is 0 if token_tally == 0.0: continue # Calculating bayes probability for this token # http://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering for category, token_score in token_scores.items(): # Bayes probability * the number of occurrences of this token scores[category] += count * \ self.calculate_bayesian_probability( category, token_score, token_tally ) # Removing empty categories from the results final_scores = {} for category, score in scores.items(): if score > 0: final_scores[category] = score return final_scores def calculate_bayesian_probability( self, cat: str, token_score: float, token_tally: float ) -> float: """ Calculates the bayesian 
probability for a given token/category :param cat: The category we're scoring for this token :type cat: str :param token_score: The tally of this token for this category :type token_score: float :param token_tally: The tally total for this token from all categories :type token_tally: float :return: bayesian probability :rtype: float """ # P that any given token IS in this category prc = self.probabilities[cat]['prc'] # P that any given token is NOT in this category prnc = self.probabilities[cat]['prnc'] # Laplace smoothing: add alpha to avoid zero probabilities # (token_in_cat, token_not_in_cat) -> k=2 for binary view per token if self.alpha > 0: prtc = (token_score + self.alpha) / (token_tally + 2.0 * self.alpha) prtnc = (token_tally - token_score + self.alpha) / ( token_tally + 2.0 * self.alpha ) else: prtnc = (token_tally - token_score) / token_tally prtc = token_score / token_tally # Assembling the parts of the bayes equation numerator = prtc * prc denominator = numerator + (prtnc * prnc) # Returning the calculated bayes probability unless the denom. is 0 return numerator / denominator if denominator != 0.0 else 0.0 def tally(self, category: str) -> int: """ Gets the tally for a requested category :param category: The category we want a tally for :type category: str :return: tally for a given category :rtype: int """ with self._lock: try: bayes_category = self.categories.get_category(category) except KeyError: return 0 return bayes_category.get_tally() def get_summaries(self) -> Dict[str, CategorySummary]: """ Returns per-category summary details. 
""" with self._lock: summaries: Dict[str, CategorySummary] = {} categories = self.categories.get_categories() for category_name, category in categories.items(): category_probability = self.probabilities.get( category_name, {'prc': 0.0, 'prnc': 0.0}, ) summaries[category_name] = CategorySummary( token_tally=category.get_tally(), prob_in_cat=float(category_probability['prc']), prob_not_in_cat=float(category_probability['prnc']), ) return summaries def save(self, destination) -> None: """ Saves classifier state to a text stream. """ with self._lock: dump_model_state(destination, self._export_model_state()) def load(self, source) -> None: """ Loads classifier state from a text stream. """ with self._lock: state = load_model_state(source) validate_model_state(state) self._apply_model_state(state) def save_to_file(self, absolute_path: str = "") -> None: """ Saves classifier state to file using atomic replacement. """ with self._lock: save_model_state_to_file(absolute_path, self._export_model_state()) def load_from_file(self, absolute_path: str = "") -> None: """ Loads classifier state from a persisted model file. """ with self._lock: state = load_model_state_from_file(absolute_path) validate_model_state(state) self._apply_model_state(state) @classmethod def normalize_category(cls, category: str | None) -> str: """ Validates and normalizes category input. 
""" if category is None: raise InvalidCategoryError("category is required") normalized = str(category).strip() if not CATEGORY_PATTERN.match(normalized): raise InvalidCategoryError( "category must be 1-64 chars and only include letters, numbers, underscore, or hyphen", ) return normalized def _export_model_state(self) -> Dict: categories = {} for category_name, category in self.categories.get_categories().items(): category_tokens = { token: int(count) for token, count in category.tokens.items() if count > 0 } categories[category_name] = { "tally": int(category.get_tally()), "tokens": category_tokens, } return { "version": PERSISTED_MODEL_VERSION, "categories": categories, } def _apply_model_state(self, state: Dict) -> None: self.categories = BayesCategories() categories = state.get("categories", {}) for category_name, category_state in categories.items(): category = self.categories.add_category(category_name) for token, count in category_state["tokens"].items(): category.train_token(token, count) self.calculate_category_probability() hickeroar-simplebayes-4803e70/simplebayes/api/000077500000000000000000000000001514723130200213345ustar00rootroot00000000000000hickeroar-simplebayes-4803e70/simplebayes/api/app.py000066400000000000000000000051201514723130200224640ustar00rootroot00000000000000import sys from contextlib import asynccontextmanager from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response from simplebayes import SimpleBayes from simplebayes.api.routes import WWW_AUTH_HEADER, create_router from simplebayes.errors import UnauthorizedError from simplebayes.runtime.readiness import ReadinessState def create_app( auth_token: str = "", language: str = "english", remove_stop_words: bool = False, verbose: bool = False, ) -> FastAPI: classifier = SimpleBayes(language=language, remove_stop_words=remove_stop_words) readiness = ReadinessState() @asynccontextmanager async def lifespan(_app: FastAPI): readiness.mark_ready() yield 
readiness.mark_not_ready() app = FastAPI(title="simplebayes", lifespan=lifespan) app.state.classifier = classifier app.state.readiness = readiness app.state.verbose = verbose app.include_router(create_router(auth_token=auth_token, verbose=verbose)) @app.middleware("http") async def verbose_middleware(request: Request, call_next): if not request.app.state.verbose: return await call_next(request) method = request.method path = request.url.path content_length = request.headers.get("content-length", "") parts = [f"[simplebayes] {method} {path}"] if content_length: parts.append(f" (Content-Length: {content_length})") print("".join(parts), file=sys.stderr) try: response = await call_next(request) except Exception: print("[simplebayes] -> (exception)", file=sys.stderr) raise body = b"" async for chunk in response.body_iterator: body += chunk preview_len = 500 if len(body) > preview_len: body_preview = body[:preview_len].decode("utf-8", errors="replace") + "..." else: body_preview = body.decode("utf-8", errors="replace") if body else "" print(f"[simplebayes] -> {response.status_code} {body_preview!r}", file=sys.stderr) return Response( content=body, status_code=response.status_code, headers=dict(response.headers), media_type=response.media_type, ) @app.exception_handler(UnauthorizedError) def unauthorized_handler(_request: Request, _exc: UnauthorizedError) -> JSONResponse: return JSONResponse( status_code=401, content={"error": "unauthorized"}, headers=WWW_AUTH_HEADER, ) return app app = create_app() hickeroar-simplebayes-4803e70/simplebayes/api/routes.py000066400000000000000000000167311514723130200232370ustar00rootroot00000000000000import sys import secrets from typing import Dict from fastapi import APIRouter, Body, Depends, Path, Request from fastapi.responses import JSONResponse from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer from simplebayes import SimpleBayes from simplebayes.errors import UnauthorizedError from simplebayes.runtime.readiness 
import ReadinessState from simplebayes.api.schemas import ( CategorySummaryResponse, ClassificationResponse, InfoResponse, MutationResponse, ) CATEGORY_REGEX = r"^[-_A-Za-z0-9]{1,64}$" def _get_classifier(request: Request) -> SimpleBayes: return request.app.state.classifier def _get_readiness(request: Request) -> ReadinessState: return request.app.state.readiness def _log_verbose(request: Request, *parts: str) -> None: """Log to stderr when verbose mode is enabled.""" if getattr(request.app.state, "verbose", False): print("[simplebayes]", *parts, file=sys.stderr) def _format_tokens(tokens: list) -> str: """Format token list, truncating if long.""" max_show = 20 if len(tokens) <= max_show: return str(tokens) return str(tokens[:max_show]) + "..." MAX_REQUEST_BODY_BYTES = 1024 * 1024 WWW_AUTH_HEADER = {"WWW-Authenticate": 'Bearer realm="simplebayes"'} def _map_summaries(classifier: SimpleBayes) -> Dict[str, CategorySummaryResponse]: summaries = classifier.get_summaries() return { category: CategorySummaryResponse( tokenTally=summary.token_tally, probNotInCat=summary.prob_not_in_cat, probInCat=summary.prob_in_cat, ) for category, summary in summaries.items() } def _create_auth_dependency(auth_token: str): """Returns a FastAPI dependency for Bearer auth. 
When auth_token is empty, no auth.""" bearer = HTTPBearer(auto_error=False) def verify( credentials: HTTPAuthorizationCredentials | None = Depends(bearer), ) -> None: if not auth_token: return if credentials is None: raise UnauthorizedError() if not secrets.compare_digest(credentials.credentials, auth_token): raise UnauthorizedError() return verify def _parse_payload(payload: bytes) -> tuple[str, JSONResponse | None]: if len(payload) > MAX_REQUEST_BODY_BYTES: return "", JSONResponse( status_code=413, content={"error": "request body too large"}, ) try: return payload.decode("utf-8"), None except UnicodeDecodeError: return "", JSONResponse( status_code=400, content={"error": "invalid utf-8 payload"}, ) def create_router(auth_token: str = "", verbose: bool = False) -> APIRouter: router = APIRouter() verify_auth = _create_auth_dependency(auth_token) @router.get("/info", response_model=InfoResponse) def info( request: Request, _auth: None = Depends(verify_auth), classifier: SimpleBayes = Depends(_get_classifier), ): result = InfoResponse(categories=_map_summaries(classifier)) _log_verbose( request, "info:", "categories=", str(list(result.categories.keys())), ) return result @router.post("/train/{category}", response_model=MutationResponse) def train( request: Request, _auth: None = Depends(verify_auth), classifier: SimpleBayes = Depends(_get_classifier), category: str = Path(..., pattern=CATEGORY_REGEX), payload: bytes = Body(b"", media_type="text/plain"), ): text, payload_response = _parse_payload(payload) if payload_response is not None: return payload_response tokens = classifier.tokenizer(text) classifier.train(category, text) summaries = _map_summaries(classifier) _log_verbose( request, "train:", "category=", category, "tokens=", _format_tokens(tokens), "summaries=", str({k: v.tokenTally for k, v in summaries.items()}), ) return MutationResponse(success=True, categories=summaries) @router.post("/untrain/{category}", response_model=MutationResponse) def untrain( 
request: Request, _auth: None = Depends(verify_auth), classifier: SimpleBayes = Depends(_get_classifier), category: str = Path(..., pattern=CATEGORY_REGEX), payload: bytes = Body(b"", media_type="text/plain"), ): text, payload_response = _parse_payload(payload) if payload_response is not None: return payload_response tokens = classifier.tokenizer(text) classifier.untrain(category, text) summaries = _map_summaries(classifier) _log_verbose( request, "untrain:", "category=", category, "tokens=", _format_tokens(tokens), "summaries=", str({k: v.tokenTally for k, v in summaries.items()}), ) return MutationResponse(success=True, categories=summaries) @router.post("/classify", response_model=ClassificationResponse) def classify( request: Request, _auth: None = Depends(verify_auth), classifier: SimpleBayes = Depends(_get_classifier), payload: bytes = Body(b"", media_type="text/plain"), ): text, payload_response = _parse_payload(payload) if payload_response is not None: return payload_response tokens = classifier.tokenizer(text) result = classifier.classify_result(text) _log_verbose( request, "classify:", "tokens=", _format_tokens(tokens), "category=", str(result.category), "score=", str(result.score), ) return ClassificationResponse(category=result.category, score=result.score) @router.post("/score") def score( request: Request, _auth: None = Depends(verify_auth), classifier: SimpleBayes = Depends(_get_classifier), payload: bytes = Body(b"", media_type="text/plain"), ): text, payload_response = _parse_payload(payload) if payload_response is not None: return payload_response tokens = classifier.tokenizer(text) scores = classifier.score(text) _log_verbose( request, "score:", "tokens=", _format_tokens(tokens), "scores=", str(scores), ) return scores @router.post("/flush", response_model=MutationResponse) def flush( request: Request, _auth: None = Depends(verify_auth), classifier: SimpleBayes = Depends(_get_classifier), payload: bytes = Body(b"", media_type="text/plain"), ): _, 
payload_response = _parse_payload(payload) if payload_response is not None: return payload_response classifier.flush() _log_verbose(request, "flush: Flushed all categories") return MutationResponse(success=True, categories=_map_summaries(classifier)) @router.get("/healthz") def healthz() -> Dict[str, str]: return {"status": "ok"} @router.get("/readyz") def readyz(readiness: ReadinessState = Depends(_get_readiness)): if readiness.is_ready: return {"status": "ready"} return JSONResponse(status_code=503, content={"status": "not ready"}) return router hickeroar-simplebayes-4803e70/simplebayes/api/schemas.py000066400000000000000000000007051514723130200233330ustar00rootroot00000000000000from typing import Dict, Optional from pydantic import BaseModel class CategorySummaryResponse(BaseModel): tokenTally: int probNotInCat: float probInCat: float class InfoResponse(BaseModel): categories: Dict[str, CategorySummaryResponse] class MutationResponse(BaseModel): success: bool categories: Dict[str, CategorySummaryResponse] class ClassificationResponse(BaseModel): category: Optional[str] score: float hickeroar-simplebayes-4803e70/simplebayes/categories.py000066400000000000000000000025231514723130200232640ustar00rootroot00000000000000from typing import Dict from simplebayes.category import BayesCategory class BayesCategories: """Acts as a container for various bayes trained categories of content""" def __init__(self): self.categories: Dict[str, BayesCategory] = {} def add_category(self, name: str) -> BayesCategory: """ Adds a bayes category that we can later train :param name: name of the category :type name: str :return: the requested category :rtype: BayesCategory """ category = BayesCategory(name) self.categories[name] = category return category def get_category(self, name: str) -> BayesCategory: """ Returns the expected category. 
Will KeyError if non existent :param name: name of the category :type name: str :return: the requested category :rtype: BayesCategory """ return self.categories[name] def get_categories(self) -> Dict[str, BayesCategory]: """ :return: dict of all categories :rtype: dict """ return self.categories def delete_category(self, name: str) -> None: """ Deletes an existing category when present. :param name: name of the category :type name: str """ self.categories.pop(name, None) hickeroar-simplebayes-4803e70/simplebayes/category.py000066400000000000000000000036641514723130200227630ustar00rootroot00000000000000from typing import Dict class BayesCategory: """ Represents a trainable category of content for bayesian classification """ def __init__(self, name: str): """ :param name: The name of the category we're creating :type name: str """ self.name: str = name self.tokens: Dict[str, int] = {} self.tally: int = 0 def train_token(self, word: str, count: int) -> None: """ Trains a particular token (increases the weight/count of it) :param word: the token we're going to train :type word: str :param count: the number of occurrences in the sample :type count: int """ if word not in self.tokens: self.tokens[word] = 0 self.tokens[word] += count self.tally += count def untrain_token(self, word: str, count: int) -> None: """ Untrains a particular token (decreases the weight/count of it) :param word: the token we're going to train :type word: str :param count: the number of occurrences in the sample :type count: int """ if word not in self.tokens: return # If we're trying to untrain more tokens than we have, we end at 0 count = min(count, self.tokens[word]) self.tokens[word] -= count self.tally -= count if self.tokens[word] <= 0: del self.tokens[word] def get_token_count(self, word: str) -> int: """ Gets the count associated with a provided token/word :param word: the token we're getting the weight of :type word: str :return: the weight/count of the token :rtype: int """ return 
self.tokens.get(word, 0) def get_tally(self) -> int: """ Gets the tally of all types :return: The total number of tokens :rtype: int """ return self.tally hickeroar-simplebayes-4803e70/simplebayes/cli.py000066400000000000000000000035351514723130200217120ustar00rootroot00000000000000import argparse import os from typing import Sequence import uvicorn from simplebayes.api.app import create_app def _env_bool(name: str, default: bool) -> bool: """Return True when env value is in ('1', 'true', 'yes'), case-insensitive.""" val = os.getenv(name, "").lower() if not val: return default return val in ("1", "true", "yes") def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser(description="Run the simplebayes API server.") parser.add_argument("--host", default=os.getenv("SIMPLEBAYES_HOST", "0.0.0.0")) parser.add_argument( "--port", type=int, default=int(os.getenv("SIMPLEBAYES_PORT", "8000")), ) parser.add_argument( "--auth-token", default=os.getenv("SIMPLEBAYES_AUTH_TOKEN", ""), ) parser.add_argument( "--language", default=os.getenv("SIMPLEBAYES_LANGUAGE", "english"), help="Language code for stemmer and stop words (e.g. 
english, spanish).", ) parser.add_argument( "--remove-stop-words", action="store_true", default=_env_bool("SIMPLEBAYES_REMOVE_STOP_WORDS", False), help="Filter common stop words (the, is, and, etc.).", ) parser.add_argument( "--verbose", action="store_true", default=_env_bool("SIMPLEBAYES_VERBOSE", False), help="Log requests, responses, and classifier operations to stderr.", ) return parser.parse_args(argv) def run(argv: Sequence[str] | None = None) -> None: args = parse_args(argv) app = create_app( auth_token=args.auth_token, language=args.language, remove_stop_words=args.remove_stop_words, verbose=args.verbose, ) uvicorn.run(app, host=args.host, port=args.port) if __name__ == "__main__": # pragma: no cover run() hickeroar-simplebayes-4803e70/simplebayes/constants.py000066400000000000000000000001031514723130200231430ustar00rootroot00000000000000import re CATEGORY_PATTERN = re.compile(r"^[-_A-Za-z0-9]{1,64}$") hickeroar-simplebayes-4803e70/simplebayes/errors.py000066400000000000000000000014171514723130200224540ustar00rootroot00000000000000class SimpleBayesError(Exception): """Base exception for simplebayes domain errors.""" class UnauthorizedError(SimpleBayesError): """Raised when Bearer auth fails. 
Produces 401 with API-standard response format.""" class InvalidCategoryError(SimpleBayesError): """Raised when a category value is invalid.""" class PersistencePathError(SimpleBayesError): """Raised when a persistence path is invalid.""" class UnsupportedModelVersionError(SimpleBayesError): """Raised when a persisted model version cannot be loaded.""" class InvalidModelStateError(SimpleBayesError): """Raised when persisted model data is malformed or inconsistent.""" class PayloadTooLargeError(SimpleBayesError): """Raised when inbound payload exceeds configured limits.""" hickeroar-simplebayes-4803e70/simplebayes/models.py000066400000000000000000000005721514723130200224240ustar00rootroot00000000000000from dataclasses import dataclass from typing import Optional @dataclass(frozen=True) class ClassificationResult: """Structured classification output.""" category: Optional[str] score: float @dataclass(frozen=True) class CategorySummary: """Summary values for one trained category.""" token_tally: int prob_in_cat: float prob_not_in_cat: float hickeroar-simplebayes-4803e70/simplebayes/persistence.py000066400000000000000000000074671514723130200234770ustar00rootroot00000000000000import json import os import tempfile from typing import Dict, TextIO from simplebayes.constants import CATEGORY_PATTERN from simplebayes.errors import ( InvalidModelStateError, PersistencePathError, UnsupportedModelVersionError, ) PERSISTED_MODEL_VERSION = 1 DEFAULT_MODEL_FILE_PATH = "/tmp/simplebayes-model.json" def dump_model_state(stream: TextIO, model_state: Dict) -> None: if stream is None: raise InvalidModelStateError("destination stream is required") json.dump(model_state, stream) def load_model_state(stream: TextIO) -> Dict: if stream is None: raise InvalidModelStateError("source stream is required") try: state = json.load(stream) except json.JSONDecodeError as exc: raise InvalidModelStateError("unable to decode persisted model") from exc if not isinstance(state, dict): raise 
InvalidModelStateError("persisted model root must be an object") return state def resolve_model_path(path: str = "") -> str: resolved_path = path.strip() if path else DEFAULT_MODEL_FILE_PATH if not os.path.isabs(resolved_path): raise PersistencePathError("model file path must be absolute") return resolved_path def save_model_state_to_file(path: str, model_state: Dict) -> None: resolved_path = resolve_model_path(path) model_directory = os.path.dirname(resolved_path) os.makedirs(model_directory, exist_ok=True) temp_path = "" try: with tempfile.NamedTemporaryFile( mode="w", encoding="utf-8", delete=False, dir=model_directory, prefix=".simplebayes-", suffix=".tmp", ) as temp_file: temp_path = temp_file.name dump_model_state(temp_file, model_state) temp_file.flush() os.fsync(temp_file.fileno()) os.replace(temp_path, resolved_path) finally: if temp_path and os.path.exists(temp_path): os.remove(temp_path) def load_model_state_from_file(path: str) -> Dict: resolved_path = resolve_model_path(path) with open(resolved_path, "r", encoding="utf-8") as source_file: return load_model_state(source_file) def validate_model_state(state: Dict) -> None: version = state.get("version") if version != PERSISTED_MODEL_VERSION: raise UnsupportedModelVersionError(f"unsupported model version: {version}") categories = state.get("categories") if not isinstance(categories, dict): raise InvalidModelStateError("persisted categories must be an object") for category_name, category_state in categories.items(): if ( not isinstance(category_name, str) or not category_name or not CATEGORY_PATTERN.match(category_name) ): raise InvalidModelStateError("invalid category name in persisted model") if not isinstance(category_state, dict): raise InvalidModelStateError("invalid category payload in persisted model") tally = category_state.get("tally") tokens = category_state.get("tokens") if not isinstance(tally, int) or tally < 0: raise InvalidModelStateError("invalid category tally in persisted model") if not 
isinstance(tokens, dict): raise InvalidModelStateError("invalid token map in persisted model") token_sum = 0 for token, count in tokens.items(): if not isinstance(token, str) or not token: raise InvalidModelStateError("invalid token name in persisted model") if not isinstance(count, int) or count <= 0: raise InvalidModelStateError("invalid token count in persisted model") token_sum += count if token_sum != tally: raise InvalidModelStateError("token tally mismatch in persisted model") hickeroar-simplebayes-4803e70/simplebayes/py.typed000066400000000000000000000000011514723130200222510ustar00rootroot00000000000000 hickeroar-simplebayes-4803e70/simplebayes/runtime/000077500000000000000000000000001514723130200222465ustar00rootroot00000000000000hickeroar-simplebayes-4803e70/simplebayes/runtime/readiness.py000066400000000000000000000006701514723130200246000ustar00rootroot00000000000000import threading class ReadinessState: def __init__(self) -> None: self._is_ready = True self._lock = threading.Lock() @property def is_ready(self) -> bool: with self._lock: return self._is_ready def mark_ready(self) -> None: with self._lock: self._is_ready = True def mark_not_ready(self) -> None: with self._lock: self._is_ready = False hickeroar-simplebayes-4803e70/simplebayes/stopwords_data.py000066400000000000000000004326501514723130200242040ustar00rootroot00000000000000# pylint: disable=too-many-lines # Built-in stopword lists. No external download or file storage required. # Sourced from stopwords-iso (MIT), prtx/Nepali-Stopwords, arulrajnet/TamilStopWords. # Croatian used for Serbian (mutually intelligible). # Yiddish: function words from Wiktionary/Wortschatz Leipzig frequency, stemmed for filter match. 
ARABIC = frozenset(( "،", "آض", "آمينَ", "آه", "آهاً", "آي", "أ", "أب", "أجل", "أجمع", "أخ", "أخذ", "أصبح", "أضحى", "أقبل", "أقل", "أكثر", "ألا", "أم", "أما", "أمامك", "أمامكَ", "أمسى", "أمّا", "أن", "أنا", "أنت", "أنتم", "أنتما", "أنتن", "أنتِ", "أنشأ", "أنّى", "أو", "أوشك", "أولئك", "أولئكم", "أولاء", "أولالك", "أوّهْ", "أي", "أيا", "أين", "أينما", "أيّ", "أَنَّ", "أََيُّ", "أُفٍّ", "إذ", "إذا", "إذاً", "إذما", "إذن", "إلى", "إليكم", "إليكما", "إليكنّ", "إليكَ", "إلَيْكَ", "إلّا", "إمّا", "إن", "إنّما", "إي", "إياك", "إياكم", "إياكما", "إياكن", "إيانا", "إياه", "إياها", "إياهم", "إياهما", "إياهن", "إياي", "إيهٍ", "إِنَّ", "ا", "ابتدأ", "اثر", "اجل", "احد", "اخرى", "اخلولق", "اذا", "اربعة", "ارتدّ", "استحال", "اطار", "اعادة", "اعلنت", "اف", "اكثر", "اكد", "الألاء", "الألى", "الا", "الاخيرة", "الان", "الاول", "الاولى", "التى", "التي", "الثاني", "الثانية", "الذاتي", "الذى", "الذي", "الذين", "السابق", "الف", "اللائي", "اللاتي", "اللتان", "اللتيا", "اللتين", "اللذان", "اللذين", "اللواتي", "الماضي", "المقبل", "الوقت", "الى", "اليوم", "اما", "امام", "امس", "ان", "انبرى", "انقلب", "انه", "انها", "او", "اول", "اي", "ايار", "ايام", "ايضا", "ب", "بات", "باسم", "بان", "بخٍ", "برس", "بسبب", "بسّ", "بشكل", "بضع", "بطآن", "بعد", "بعض", "بك", "بكم", "بكما", "بكن", "بل", "بلى", "بما", "بماذا", "بمن", "بن", "بنا", "به", "بها", "بي", "بيد", "بين", "بَسْ", "بَلْهَ", "بِئْسَ", "تانِ", "تانِك", "تبدّل", "تجاه", "تحوّل", "تلقاء", "تلك", "تلكم", "تلكما", "تم", "تينك", "تَيْنِ", "تِه", "تِي", "ثلاثة", "ثم", "ثمّ", "ثمّة", "ثُمَّ", "جعل", "جلل", "جميع", "جير", "حار", "حاشا", "حاليا", "حاي", "حتى", "حرى", "حسب", "حم", "حوالى", "حول", "حيث", "حيثما", "حين", "حيَّ", "حَبَّذَا", "حَتَّى", "حَذارِ", "خلا", "خلال", "دون", "دونك", "ذا", "ذات", "ذاك", "ذانك", "ذانِ", "ذلك", "ذلكم", "ذلكما", "ذلكن", "ذو", "ذوا", "ذواتا", "ذواتي", "ذيت", "ذينك", "ذَيْنِ", "ذِه", "ذِي", "راح", "رجع", "رويدك", "ريث", "رُبَّ", "زيارة", "سبحان", "سرعان", "سنة", "سنوات", "سوف", "سوى", "سَاءَ", "سَاءَمَا", "شبه", "شخصا", 
"شرع", "شَتَّانَ", "صار", "صباح", "صفر", "صهٍ", "صهْ", "ضد", "ضمن", "طاق", "طالما", "طفق", "طَق", "ظلّ", "عاد", "عام", "عاما", "عامة", "عدا", "عدة", "عدد", "عدم", "عسى", "عشر", "عشرة", "علق", "على", "عليك", "عليه", "عليها", "علًّ", "عن", "عند", "عندما", "عوض", "عين", "عَدَسْ", "عَمَّا", "غدا", "غير", "ـ", "ف", "فان", "فلان", "فو", "فى", "في", "فيم", "فيما", "فيه", "فيها", "قال", "قام", "قبل", "قد", "قطّ", "قلما", "قوة", "كأنّما", "كأين", "كأيّ", "كأيّن", "كاد", "كان", "كانت", "كذا", "كذلك", "كرب", "كل", "كلا", "كلاهما", "كلتا", "كلم", "كليكما", "كليهما", "كلّما", "كلَّا", "كم", "كما", "كي", "كيت", "كيف", "كيفما", "كَأَنَّ", "كِخ", "لئن", "لا", "لات", "لاسيما", "لدن", "لدى", "لعمر", "لقاء", "لك", "لكم", "لكما", "لكن", "لكنَّما", "لكي", "لكيلا", "للامم", "لم", "لما", "لمّا", "لن", "لنا", "له", "لها", "لو", "لوكالة", "لولا", "لوما", "لي", "لَسْتَ", "لَسْتُ", "لَسْتُم", "لَسْتُمَا", "لَسْتُنَّ", "لَسْتِ", "لَسْنَ", "لَعَلَّ", "لَكِنَّ", "لَيْتَ", "لَيْسَ", "لَيْسَا", "لَيْسَتَا", "لَيْسَتْ", "لَيْسُوا", "لَِسْنَا", "ما", "ماانفك", "مابرح", "مادام", "ماذا", "مازال", "مافتئ", "مايو", "متى", "مثل", "مذ", "مساء", "مع", "معاذ", "مقابل", "مكانكم", "مكانكما", "مكانكنّ", "مكانَك", "مليار", "مليون", "مما", "ممن", "من", "منذ", "منها", "مه", "مهما", "مَنْ", "مِن", "نحن", "نحو", "نعم", "نفس", "نفسه", "نهاية", "نَخْ", "نِعِمّا", "نِعْمَ", "ها", "هاؤم", "هاكَ", "هاهنا", "هبّ", "هذا", "هذه", "هكذا", "هل", "هلمَّ", "هلّا", "هم", "هما", "هن", "هنا", "هناك", "هنالك", "هو", "هي", "هيا", "هيت", "هيّا", "هَؤلاء", "هَاتانِ", "هَاتَيْنِ", "هَاتِه", "هَاتِي", "هَجْ", "هَذا", "هَذانِ", "هَذَيْنِ", "هَذِه", "هَذِي", "هَيْهَاتَ", "و", "و6", "وا", "واحد", "واضاف", "واضافت", "واكد", "وان", "واهاً", "واوضح", "وراءَك", "وفي", "وقال", "وقالت", "وقد", "وقف", "وكان", "وكانت", "ولا", "ولم", "ومن", "وهو", "وهي", "ويكأنّ", "وَيْ", "وُشْكَانََ", "يكون", "يمكن", "يوم", "ّأيّان" )) ARMENIAN = frozenset(( "այդ", "այլ", "այն", "այս", "դու", "դուք", "եմ", "են", "ենք", "ես", "եք", "է", "էի", "էին", "էինք", 
"էիր", "էիք", "էր", "ըստ", "թ", "ի", "ին", "իսկ", "իր", "կամ", "համար", "հետ", "հետո", "մենք", "մեջ", "մի", "ն", "նա", "նաև", "նրա", "նրանք", "որ", "որը", "որոնք", "որպես", "ու", "ում", "պիտի", "վրա", "և" )) BASQUE = frozenset(( "al", "anitz", "arabera", "asko", "baina", "bat", "batean", "batek", "bati", "batzuei", "batzuek", "batzuetan", "batzuk", "bera", "beraiek", "berau", "berauek", "bere", "berori", "beroriek", "beste", "bezala", "da", "dago", "dira", "ditu", "du", "dute", "edo", "egin", "ere", "eta", "eurak", "ez", "gainera", "gu", "gutxi", "guzti", "haiei", "haiek", "haietan", "hainbeste", "hala", "han", "handik", "hango", "hara", "hari", "hark", "hartan", "hau", "hauei", "hauek", "hauetan", "hemen", "hemendik", "hemengo", "hi", "hona", "honek", "honela", "honetan", "honi", "hor", "hori", "horiei", "horiek", "horietan", "horko", "horra", "horrek", "horrela", "horretan", "horri", "hortik", "hura", "izan", "ni", "noiz", "nola", "non", "nondik", "nongo", "nor", "nora", "ze", "zein", "zen", "zenbait", "zenbat", "zer", "zergatik", "ziren", "zituen", "zu", "zuek", "zuen", "zuten" )) CATALAN = frozenset(( "a", "abans", "ací", "ah", "així", "això", "al", "aleshores", "algun", "alguna", "algunes", "alguns", "alhora", "allà", "allí", "allò", "als", "altra", "altre", "altres", "amb", "ambdues", "ambdós", "anar", "ans", "apa", "aquell", "aquella", "aquelles", "aquells", "aquest", "aquesta", "aquestes", "aquests", "aquí", "baix", "bastant", "bé", "cada", "cadascuna", "cadascunes", "cadascuns", "cadascú", "com", "consegueixo", "conseguim", "conseguir", "consigueix", "consigueixen", "consigueixes", "contra", "d'un", "d'una", "d'unes", "d'uns", "dalt", "de", "del", "dels", "des", "des de", "després", "dins", "dintre", "donat", "doncs", "durant", "e", "eh", "el", "elles", "ells", "els", "em", "en", "encara", "ens", "entre", "era", "erem", "eren", "eres", "es", "esta", "estan", "estat", "estava", "estaven", "estem", "esteu", "estic", "està", "estàvem", "estàveu", "et", "etc", 
"ets", "fa", "faig", "fan", "fas", "fem", "fer", "feu", "fi", "fins", "fora", "gairebé", "ha", "han", "has", "haver", "havia", "he", "hem", "heu", "hi", "ho", "i", "igual", "iguals", "inclòs", "ja", "jo", "l'hi", "la", "les", "li", "li'n", "llarg", "llavors", "m'he", "ma", "mal", "malgrat", "mateix", "mateixa", "mateixes", "mateixos", "me", "mentre", "meu", "meus", "meva", "meves", "mode", "molt", "molta", "moltes", "molts", "mon", "mons", "més", "n'he", "n'hi", "ne", "ni", "no", "nogensmenys", "només", "nosaltres", "nostra", "nostre", "nostres", "o", "oh", "oi", "on", "pas", "pel", "pels", "per", "per que", "perquè", "però", "poc", "poca", "pocs", "podem", "poden", "poder", "podeu", "poques", "potser", "primer", "propi", "puc", "qual", "quals", "quan", "quant", "que", "quelcom", "qui", "quin", "quina", "quines", "quins", "què", "s'ha", "s'han", "sa", "sabem", "saben", "saber", "sabeu", "sap", "saps", "semblant", "semblants", "sense", "ser", "ses", "seu", "seus", "seva", "seves", "si", "sobre", "sobretot", "soc", "solament", "sols", "som", "son", "sons", "sota", "sou", "sóc", "són", "t'ha", "t'han", "t'he", "ta", "tal", "també", "tampoc", "tan", "tant", "tanta", "tantes", "te", "tene", "tenim", "tenir", "teniu", "teu", "teus", "teva", "teves", "tinc", "ton", "tons", "tot", "tota", "totes", "tots", "un", "una", "unes", "uns", "us", "va", "vaig", "vam", "van", "vas", "veu", "vosaltres", "vostra", "vostre", "vostres", "érem", "éreu", "és", "éssent", "últim", "ús" )) DANISH = frozenset(( "ad", "af", "aldrig", "alle", "alt", "anden", "andet", "andre", "at", "bare", "begge", "blev", "blive", "bliver", "da", "de", "dem", "den", "denne", "der", "deres", "det", "dette", "dig", "din", "dine", "disse", "dit", "dog", "du", "efter", "ej", "eller", "en", "end", "ene", "eneste", "enhver", "er", "et", "far", "fem", "fik", "fire", "flere", "fleste", "for", "fordi", "forrige", "fra", "få", "får", "før", "god", "godt", "ham", "han", "hans", "har", "havde", "have", "hej", "helt", 
"hende", "hendes", "her", "hos", "hun", "hvad", "hvem", "hver", "hvilken", "hvis", "hvor", "hvordan", "hvorfor", "hvornår", "i", "ikke", "ind", "ingen", "intet", "ja", "jeg", "jer", "jeres", "jo", "kan", "kom", "komme", "kommer", "kun", "kunne", "lad", "lav", "lidt", "lige", "lille", "man", "mand", "mange", "med", "meget", "men", "mens", "mere", "mig", "min", "mine", "mit", "mod", "må", "ned", "nej", "ni", "nogen", "noget", "nogle", "nu", "ny", "nyt", "når", "nær", "næste", "næsten", "og", "også", "okay", "om", "op", "os", "otte", "over", "på", "se", "seks", "selv", "ser", "ses", "sig", "sige", "sin", "sine", "sit", "skal", "skulle", "som", "stor", "store", "syv", "så", "sådan", "tag", "tage", "thi", "ti", "til", "to", "tre", "ud", "under", "var", "ved", "vi", "vil", "ville", "vor", "vores", "være", "været" )) DUTCH = frozenset(( "aan", "aangaande", "aangezien", "achte", "achter", "achterna", "af", "afgelopen", "al", "aldaar", "aldus", "alhoewel", "alias", "alle", "allebei", "alleen", "alles", "als", "alsnog", "altijd", "altoos", "ander", "andere", "anders", "anderszins", "beetje", "behalve", "behoudens", "beide", "beiden", "ben", "beneden", "bent", "bepaald", "betreffende", "bij", "bijna", "bijv", "binnen", "binnenin", "blijkbaar", "blijken", "boven", "bovenal", "bovendien", "bovengenoemd", "bovenstaand", "bovenvermeld", "buiten", "bv", "daar", "daardoor", "daarheen", "daarin", "daarna", "daarnet", "daarom", "daarop", "daaruit", "daarvanlangs", "dan", "dat", "de", "deden", "deed", "der", "derde", "derhalve", "dertig", "deze", "dhr", "die", "dikwijls", "dit", "doch", "doe", "doen", "doet", "door", "doorgaand", "drie", "duizend", "dus", "echter", "een", "eens", "eer", "eerdat", "eerder", "eerlang", "eerst", "eerste", "eigen", "eigenlijk", "elk", "elke", "en", "enig", "enige", "enigszins", "enkel", "er", "erdoor", "erg", "ergens", "etc", "etcetera", "even", "eveneens", "evenwel", "gauw", "ge", "gedurende", "geen", "gehad", "gekund", "geleden", "gelijk", "gemoeten", 
"gemogen", "genoeg", "geweest", "gewoon", "gewoonweg", "haar", "haarzelf", "had", "hadden", "hare", "heb", "hebben", "hebt", "hedden", "heeft", "heel", "hem", "hemzelf", "hen", "het", "hetzelfde", "hier", "hierbeneden", "hierboven", "hierin", "hierna", "hierom", "hij", "hijzelf", "hoe", "hoewel", "honderd", "hun", "hunne", "ieder", "iedere", "iedereen", "iemand", "iets", "ik", "ikzelf", "in", "inderdaad", "inmiddels", "intussen", "inzake", "is", "ja", "je", "jezelf", "jij", "jijzelf", "jou", "jouw", "jouwe", "juist", "jullie", "kan", "klaar", "kon", "konden", "krachtens", "kun", "kunnen", "kunt", "laatst", "later", "liever", "lijken", "lijkt", "maak", "maakt", "maakte", "maakten", "maar", "mag", "maken", "me", "meer", "meest", "meestal", "men", "met", "mevr", "mezelf", "mij", "mijn", "mijnent", "mijner", "mijzelf", "minder", "miss", "misschien", "missen", "mits", "mocht", "mochten", "moest", "moesten", "moet", "moeten", "mogen", "mr", "mrs", "mw", "na", "naar", "nadat", "nam", "namelijk", "nee", "neem", "negen", "nemen", "nergens", "net", "niemand", "niet", "niets", "niks", "noch", "nochtans", "nog", "nogal", "nooit", "nu", "nv", "of", "ofschoon", "om", "omdat", "omhoog", "omlaag", "omstreeks", "omtrent", "omver", "ondanks", "onder", "ondertussen", "ongeveer", "ons", "onszelf", "onze", "onzeker", "ooit", "ook", "op", "opnieuw", "opzij", "over", "overal", "overeind", "overige", "overigens", "paar", "pas", "per", "precies", "recent", "redelijk", "reeds", "rond", "rondom", "samen", "sedert", "sinds", "sindsdien", "slechts", "sommige", "spoedig", "steeds", "tamelijk", "te", "tegen", "tegenover", "tenzij", "terwijl", "thans", "tien", "tiende", "tijdens", "tja", "toch", "toe", "toen", "toenmaals", "toenmalig", "tot", "totdat", "tussen", "twee", "tweede", "u", "uit", "uitgezonderd", "uw", "vaak", "vaakwat", "van", "vanaf", "vandaan", "vanuit", "vanwege", "veel", "veeleer", "veertig", "verder", "verscheidene", "verschillende", "vervolgens", "via", "vier", "vierde", "vijf", 
"vijfde", "vijftig", "vol", "volgend", "volgens", "voor", "vooraf", "vooral", "vooralsnog", "voorbij", "voordat", "voordezen", "voordien", "voorheen", "voorop", "voorts", "vooruit", "vrij", "vroeg", "waar", "waarom", "waarschijnlijk", "wanneer", "want", "waren", "was", "wat", "we", "wederom", "weer", "weg", "wegens", "weinig", "wel", "weldra", "welk", "welke", "werd", "werden", "werder", "wezen", "whatever", "wie", "wiens", "wier", "wij", "wijzelf", "wil", "wilden", "willen", "word", "worden", "wordt", "zal", "ze", "zei", "zeker", "zelf", "zelfde", "zelfs", "zes", "zeven", "zich", "zichzelf", "zij", "zijn", "zijne", "zijzelf", "zo", "zoals", "zodat", "zodra", "zonder", "zou", "zouden", "zowat", "zulk", "zulke", "zullen", "zult" )) ENGLISH = frozenset(( "'ll", "'tis", "'twas", "'ve", "10", "39", "a", "a's", "able", "ableabout", "about", "above", "abroad", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "adopted", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ago", "ah", "ahead", "ai", "ain't", "aint", "al", "all", "allow", "allows", "almost", "alone", "along", "alongside", "already", "also", "although", "always", "am", "amid", "amidst", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "aq", "ar", "are", "area", "areas", "aren", "aren't", "arent", "arise", "around", "arpa", "as", "aside", "ask", "asked", "asking", "asks", "associated", "at", "au", "auth", "available", "aw", "away", "awfully", "az", "b", "ba", "back", "backed", "backing", "backs", "backward", "backwards", "bb", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "began", "begin", "beginning", "beginnings", "begins", "behind", "being", "beings", 
"believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bf", "bg", "bh", "bi", "big", "bill", "billion", "biol", "bj", "bm", "bn", "bo", "both", "bottom", "br", "brief", "briefly", "bs", "bt", "but", "buy", "bv", "bw", "by", "bz", "c", "c'mon", "c's", "ca", "call", "came", "can", "can't", "cannot", "cant", "caption", "case", "cases", "cause", "causes", "cc", "cd", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "ck", "cl", "clear", "clearly", "click", "cm", "cmon", "cn", "co", "co.", "com", "come", "comes", "computer", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "copy", "corresponding", "could", "could've", "couldn", "couldn't", "couldnt", "course", "cr", "cry", "cs", "cu", "currently", "cv", "cx", "cy", "cz", "d", "dare", "daren't", "darent", "date", "de", "dear", "definitely", "describe", "described", "despite", "detail", "did", "didn", "didn't", "didnt", "differ", "different", "differently", "directly", "dj", "dk", "dm", "do", "does", "doesn", "doesn't", "doesnt", "doing", "don", "don't", "done", "dont", "doubtful", "down", "downed", "downing", "downs", "downwards", "due", "during", "dz", "e", "each", "early", "ec", "ed", "edu", "ee", "effect", "eg", "eh", "eight", "eighty", "either", "eleven", "else", "elsewhere", "empty", "end", "ended", "ending", "ends", "enough", "entirely", "er", "es", "especially", "et", "et-al", "etc", "even", "evenly", "ever", "evermore", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "f", "face", "faces", "fact", "facts", "fairly", "far", "farther", "felt", "few", "fewer", "ff", "fi", "fifteen", "fifth", "fifty", "fify", "fill", "find", "finds", "fire", "first", "five", "fix", "fj", "fk", "fm", "fo", "followed", "following", "follows", "for", "forever", "former", "formerly", "forth", "forty", "forward", "found", "four", "fr", "free", "from", "front", "full", "fully", "further", "furthered", 
"furthering", "furthermore", "furthers", "fx", "g", "ga", "gave", "gb", "gd", "ge", "general", "generally", "get", "gets", "getting", "gf", "gg", "gh", "gi", "give", "given", "gives", "giving", "gl", "gm", "gmt", "gn", "go", "goes", "going", "gone", "good", "goods", "got", "gotten", "gov", "gp", "gq", "gr", "great", "greater", "greatest", "greetings", "group", "grouped", "grouping", "groups", "gs", "gt", "gu", "gw", "gy", "h", "had", "hadn't", "hadnt", "half", "happens", "hardly", "has", "hasn", "hasn't", "hasnt", "have", "haven", "haven't", "havent", "having", "he", "he'd", "he'll", "he's", "hed", "hell", "hello", "help", "hence", "her", "here", "here's", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "herse”", "hes", "hi", "hid", "high", "higher", "highest", "him", "himself", "himse”", "his", "hither", "hk", "hm", "hn", "home", "homepage", "hopefully", "how", "how'd", "how'll", "how's", "howbeit", "however", "hr", "ht", "htm", "html", "http", "hu", "hundred", "i", "i'd", "i'll", "i'm", "i've", "i.e.", "id", "ie", "if", "ignored", "ii", "il", "ill", "im", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "inc.", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "inside", "insofar", "instead", "int", "interest", "interested", "interesting", "interests", "into", "invention", "inward", "io", "iq", "ir", "is", "isn", "isn't", "isnt", "it", "it'd", "it'll", "it's", "itd", "itll", "its", "itself", "itse”", "ive", "j", "je", "jm", "jo", "join", "jp", "just", "k", "ke", "keep", "keeps", "kept", "keys", "kg", "kh", "ki", "kind", "km", "kn", "knew", "know", "known", "knows", "kp", "kr", "kw", "ky", "kz", "l", "la", "large", "largely", "last", "lately", "later", "latest", "latter", "latterly", "lb", "lc", "least", "length", "less", "lest", "let", "let's", "lets", "li", "like", "liked", "likely", "likewise", "line", "little", "lk", "ll", "long", "longer", "longest", "look", "looking", 
"looks", "low", "lower", "lr", "ls", "lt", "ltd", "lu", "lv", "ly", "m", "ma", "made", "mainly", "make", "makes", "making", "man", "many", "may", "maybe", "mayn't", "maynt", "mc", "md", "me", "mean", "means", "meantime", "meanwhile", "member", "members", "men", "merely", "mg", "mh", "microsoft", "might", "might've", "mightn't", "mightnt", "mil", "mill", "million", "mine", "minus", "miss", "mk", "ml", "mm", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mp", "mq", "mr", "mrs", "ms", "msie", "mt", "mu", "much", "mug", "must", "must've", "mustn't", "mustnt", "mv", "mw", "mx", "my", "myself", "myse”", "mz", "n", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needed", "needing", "needn't", "neednt", "needs", "neither", "net", "netscape", "never", "neverf", "neverless", "nevertheless", "new", "newer", "newest", "next", "nf", "ng", "ni", "nine", "ninety", "nl", "no", "no-one", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "notwithstanding", "novel", "now", "nowhere", "np", "nr", "nu", "null", "number", "numbers", "nz", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "older", "oldest", "om", "omitted", "on", "once", "one", "one's", "ones", "only", "onto", "open", "opened", "opening", "opens", "opposite", "or", "ord", "order", "ordered", "ordering", "orders", "org", "other", "others", "otherwise", "ought", "oughtn't", "oughtnt", "our", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "pa", "page", "pages", "part", "parted", "particular", "particularly", "parting", "parts", "past", "pe", "per", "perhaps", "pf", "pg", "ph", "pk", "pl", "place", "placed", "places", "please", "plus", "pm", "pmid", "pn", "point", "pointed", "pointing", "points", "poorly", "possible", "possibly", "potentially", "pp", "pr", "predominantly", "present", "presented", "presenting", "presents", "presumably", 
"previously", "primarily", "probably", "problem", "problems", "promptly", "proud", "provided", "provides", "pt", "put", "puts", "pw", "py", "q", "qa", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "reserved", "respectively", "resulted", "resulting", "results", "right", "ring", "ro", "room", "rooms", "round", "ru", "run", "rw", "s", "sa", "said", "same", "saw", "say", "saying", "says", "sb", "sc", "sd", "se", "sec", "second", "secondly", "seconds", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "sees", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "seventy", "several", "sg", "sh", "shall", "shan't", "shant", "she", "she'd", "she'll", "she's", "shed", "shell", "shes", "should", "should've", "shouldn", "shouldn't", "shouldnt", "show", "showed", "showing", "shown", "showns", "shows", "si", "side", "sides", "significant", "significantly", "similar", "similarly", "since", "sincere", "site", "six", "sixty", "sj", "sk", "sl", "slightly", "sm", "small", "smaller", "smallest", "sn", "so", "some", "somebody", "someday", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "sr", "st", "state", "states", "still", "stop", "strongly", "su", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sv", "sy", "system", "sz", "t", "t's", "take", "taken", "taking", "tc", "td", "tell", "ten", "tends", "test", "text", "tf", "tg", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "that's", "that've", "thatll", "thats", "thatve", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "there'd", "there'll", "there're", "there's", "there've", "thereafter", "thereby", "thered", "therefore", 
"therein", "therell", "thereof", "therere", "theres", "thereto", "thereupon", "thereve", "these", "they", "they'd", "they'll", "they're", "they've", "theyd", "theyll", "theyre", "theyve", "thick", "thin", "thing", "things", "think", "thinks", "third", "thirty", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thought", "thoughts", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "til", "till", "tip", "tis", "tj", "tk", "tm", "tn", "to", "today", "together", "too", "took", "top", "toward", "towards", "tp", "tr", "tried", "tries", "trillion", "truly", "try", "trying", "ts", "tt", "turn", "turned", "turning", "turns", "tv", "tw", "twas", "twelve", "twenty", "twice", "two", "tz", "u", "ua", "ug", "uk", "um", "un", "under", "underneath", "undoing", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon", "ups", "upwards", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "uucp", "uy", "uz", "v", "va", "value", "various", "vc", "ve", "versus", "very", "vg", "vi", "via", "viz", "vn", "vol", "vols", "vs", "vu", "w", "want", "wanted", "wanting", "wants", "was", "wasn", "wasn't", "wasnt", "way", "ways", "we", "we'd", "we'll", "we're", "we've", "web", "webpage", "website", "wed", "welcome", "well", "wells", "went", "were", "weren", "weren't", "werent", "weve", "wf", "what", "what'd", "what'll", "what's", "what've", "whatever", "whatll", "whats", "whatve", "when", "when'd", "when'll", "when's", "whence", "whenever", "where", "where'd", "where'll", "where's", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst", "whim", "whither", "who", "who'd", "who'll", "who's", "whod", "whoever", "whole", "wholl", "whom", "whomever", "whos", "whose", "why", "why'd", "why'll", "why's", "widely", "width", "will", "willing", "wish", "with", "within", "without", "won", "won't", "wonder", "wont", "words", "work", 
"worked", "working", "works", "world", "would", "would've", "wouldn", "wouldn't", "wouldnt", "ws", "www", "x", "y", "ye", "year", "years", "yes", "yet", "you", "you'd", "you'll", "you're", "you've", "youd", "youll", "young", "younger", "youngest", "your", "youre", "yours", "yourself", "yourselves", "youve", "yt", "yu", "z", "za", "zero", "zm", "zr" )) ESPERANTO = frozenset(( "adiaŭ", "ajn", "al", "ankoraŭ", "antaŭ", "aŭ", "bonan", "bonvole", "bonvolu", "bv", "ci", "cia", "cian", "cin", "d-ro", "da", "de", "dek", "deka", "do", "doktor'", "doktoro", "du", "dua", "dum", "eble", "ekz", "ekzemple", "en", "estas", "estis", "estos", "estu", "estus", "eĉ", "f-no", "feliĉan", "for", "fraŭlino", "ha", "havas", "havis", "havos", "havu", "havus", "he", "ho", "hu", "ili", "ilia", "ilian", "ilin", "inter", "io", "ion", "iu", "iujn", "iun", "ja", "jam", "je", "jes", "k", "kaj", "ke", "kio", "kion", "kiu", "kiujn", "kiun", "kvankam", "kvar", "kvara", "kvazaŭ", "kvin", "kvina", "la", "li", "lia", "lian", "lin", "malantaŭ", "male", "malgraŭ", "mem", "mi", "mia", "mian", "min", "minus", "naŭ", "naŭa", "ne", "nek", "nenio", "nenion", "neniu", "neniun", "nepre", "ni", "nia", "nian", "nin", "nu", "nun", "nur", "ok", "oka", "oni", "onia", "onian", "onin", "plej", "pli", "plu", "plus", "por", "post", "preter", "s-no", "s-ro", "se", "sed", "sep", "sepa", "ses", "sesa", "si", "sia", "sian", "sin", "sinjor'", "sinjorino", "sinjoro", "sub", "super", "supren", "sur", "tamen", "tio", "tion", "tiu", "tiujn", "tiun", "tra", "tri", "tria", "tuj", "tute", "unu", "unua", "ve", "verŝajne", "vi", "via", "vian", "vin", "ĉi", "ĉio", "ĉion", "ĉiu", "ĉiujn", "ĉiun", "ĉu", "ĝi", "ĝia", "ĝian", "ĝin", "ĝis", "ĵus", "ŝi", "ŝia", "ŝin" )) ESTONIAN = frozenset(( "aga", "ei", "et", "ja", "jah", "kas", "kui", "kõik", "ma", "me", "mida", "midagi", "mind", "minu", "mis", "mu", "mul", "mulle", "nad", "nii", "oled", "olen", "oli", "oma", "on", "pole", "sa", "seda", "see", "selle", "siin", "siis", "ta", "te", "ära" 
)) FINNISH = frozenset(( "aiemmin", "aika", "aikaa", "aikaan", "aikaisemmin", "aikaisin", "aikajen", "aikana", "aikoina", "aikoo", "aikovat", "aina", "ainakaan", "ainakin", "ainoa", "ainoat", "aiomme", "aion", "aiotte", "aist", "aivan", "ajan", "alas", "alemmas", "alkuisin", "alkuun", "alla", "alle", "aloitamme", "aloitan", "aloitat", "aloitatte", "aloitattivat", "aloitettava", "aloitettevaksi", "aloitettu", "aloitimme", "aloitin", "aloitit", "aloititte", "aloittaa", "aloittamatta", "aloitti", "aloittivat", "alta", "aluksi", "alussa", "alusta", "annettavaksi", "annetteva", "annettu", "ansiosta", "antaa", "antamatta", "antoi", "aoua", "apu", "asia", "asiaa", "asian", "asiasta", "asiat", "asioiden", "asioihin", "asioita", "asti", "avuksi", "avulla", "avun", "avutta", "edelle", "edelleen", "edellä", "edeltä", "edemmäs", "edes", "edessä", "edestä", "ehkä", "ei", "eikä", "eilen", "eivät", "eli", "ellei", "elleivät", "ellemme", "ellen", "ellet", "ellette", "emme", "en", "enemmän", "eniten", "ennen", "ensi", "ensimmäinen", "ensimmäiseksi", "ensimmäisen", "ensimmäisenä", "ensimmäiset", "ensimmäisiksi", "ensimmäisinä", "ensimmäisiä", "ensimmäistä", "ensin", "entinen", "entisen", "entisiä", "entisten", "entistä", "enää", "eri", "erittäin", "erityisesti", "eräiden", "eräs", "eräät", "esi", "esiin", "esillä", "esimerkiksi", "et", "eteen", "etenkin", "etessa", "ette", "ettei", "että", "haikki", "halua", "haluaa", "haluamatta", "haluamme", "haluan", "haluat", "haluatte", "haluavat", "halunnut", "halusi", "halusimme", "halusin", "halusit", "halusitte", "halusivat", "halutessa", "haluton", "he", "hei", "heidän", "heidät", "heihin", "heille", "heillä", "heiltä", "heissä", "heistä", "heitä", "helposti", "heti", "hetkellä", "hieman", "hitaasti", "hoikein", "huolimatta", "huomenna", "hyvien", "hyviin", "hyviksi", "hyville", "hyviltä", "hyvin", "hyvinä", "hyvissä", "hyvistä", "hyviä", "hyvä", "hyvät", "hyvää", "hän", "häneen", "hänelle", "hänellä", "häneltä", "hänen", "hänessä", 
"hänestä", "hänet", "häntä", "ihan", "ilman", "ilmeisesti", "itse", "itsensä", "itseään", "ja", "jo", "johon", "joiden", "joihin", "joiksi", "joilla", "joille", "joilta", "joina", "joissa", "joista", "joita", "joka", "jokainen", "jokin", "joko", "joksi", "joku", "jolla", "jolle", "jolloin", "jolta", "jompikumpi", "jona", "jonka", "jonkin", "jonne", "joo", "jopa", "jos", "joskus", "jossa", "josta", "jota", "jotain", "joten", "jotenkin", "jotenkuten", "jotka", "jotta", "jouduimme", "jouduin", "jouduit", "jouduitte", "joudumme", "joudun", "joudutte", "joukkoon", "joukossa", "joukosta", "joutua", "joutui", "joutuivat", "joutumaan", "joutuu", "joutuvat", "juuri", "jälkeen", "jälleen", "jää", "kahdeksan", "kahdeksannen", "kahdella", "kahdelle", "kahdelta", "kahden", "kahdessa", "kahdesta", "kahta", "kahteen", "kai", "kaiken", "kaikille", "kaikilta", "kaikkea", "kaikki", "kaikkia", "kaikkiaan", "kaikkialla", "kaikkialle", "kaikkialta", "kaikkien", "kaikkin", "kaksi", "kannalta", "kannattaa", "kanssa", "kanssaan", "kanssamme", "kanssani", "kanssanne", "kanssasi", "kauan", "kauemmas", "kaukana", "kautta", "kehen", "keiden", "keihin", "keiksi", "keille", "keillä", "keiltä", "keinä", "keissä", "keistä", "keitten", "keittä", "keitä", "keneen", "keneksi", "kenelle", "kenellä", "keneltä", "kenen", "kenenä", "kenessä", "kenestä", "kenet", "kenettä", "kennessästä", "kenties", "kerran", "kerta", "kertaa", "keskellä", "kesken", "keskimäärin", "ketkä", "ketä", "kiitos", "kohti", "koko", "kokonaan", "kolmas", "kolme", "kolmen", "kolmesti", "koska", "koskaan", "kovin", "kuin", "kuinka", "kuinkan", "kuitenkaan", "kuitenkin", "kuka", "kukaan", "kukin", "kukka", "kumpainen", "kumpainenkaan", "kumpi", "kumpikaan", "kumpikin", "kun", "kuten", "kuuden", "kuusi", "kuutta", "kylliksi", "kyllä", "kymmenen", "kyse", "liian", "liki", "lisäksi", "lisää", "lla", "luo", "luona", "lähekkäin", "lähelle", "lähellä", "läheltä", "lähemmäs", "lähes", "lähinnä", "lähtien", "läpi", "mahdollisimman", 
"mahdollista", "me", "meidän", "meidät", "meihin", "meille", "meillä", "meiltä", "meissä", "meistä", "meitä", "melkein", "melko", "menee", "meneet", "menemme", "menen", "menet", "menette", "menevät", "meni", "menimme", "menin", "menit", "menivät", "mennessä", "mennyt", "menossa", "mihin", "mikin", "miksi", "mikä", "mikäli", "mikään", "mille", "milloin", "milloinkan", "millä", "miltä", "minkä", "minne", "minua", "minulla", "minulle", "minulta", "minun", "minussa", "minusta", "minut", "minuun", "minä", "missä", "mistä", "miten", "mitkä", "mitä", "mitään", "moi", "molemmat", "mones", "monesti", "monet", "moni", "moniaalla", "moniaalle", "moniaalta", "monta", "muassa", "muiden", "muita", "muka", "mukaan", "mukaansa", "mukana", "mutta", "muu", "muualla", "muualle", "muualta", "muuanne", "muulloin", "muun", "muut", "muuta", "muutama", "muutaman", "muuten", "myöhemmin", "myös", "myöskin", "myöskään", "myötä", "ne", "neljä", "neljän", "neljää", "niiden", "niihin", "niiksi", "niille", "niillä", "niiltä", "niin", "niinä", "niissä", "niistä", "niitä", "noiden", "noihin", "noiksi", "noilla", "noille", "noilta", "noin", "noina", "noissa", "noista", "noita", "nopeammin", "nopeasti", "nopeiten", "nro", "nuo", "nyt", "näiden", "näihin", "näiksi", "näille", "näillä", "näiltä", "näin", "näinä", "näissä", "näissähin", "näissälle", "näissältä", "näissästä", "näistä", "näitä", "nämä", "ohi", "oikea", "oikealla", "oikein", "ole", "olemme", "olen", "olet", "olette", "oleva", "olevan", "olevat", "oli", "olimme", "olin", "olisi", "olisimme", "olisin", "olisit", "olisitte", "olisivat", "olit", "olitte", "olivat", "olla", "olleet", "olli", "ollut", "oma", "omaa", "omaan", "omaksi", "omalle", "omalta", "oman", "omassa", "omat", "omia", "omien", "omiin", "omiksi", "omille", "omilta", "omissa", "omista", "on", "onkin", "onko", "ovat", "paikoittain", "paitsi", "pakosti", "paljon", "paremmin", "parempi", "parhaillaan", "parhaiten", "perusteella", "peräti", "pian", "pieneen", "pieneksi", 
"pienelle", "pienellä", "pieneltä", "pienempi", "pienestä", "pieni", "pienin", "poikki", "puolesta", "puolestaan", "päälle", "runsaasti", "saakka", "sadam", "sama", "samaa", "samaan", "samalla", "samallalta", "samallassa", "samallasta", "saman", "samat", "samoin", "sata", "sataa", "satojen", "se", "seitsemän", "sekä", "sen", "seuraavat", "siellä", "sieltä", "siihen", "siinä", "siis", "siitä", "sijaan", "siksi", "sille", "silloin", "sillä", "silti", "siltä", "sinne", "sinua", "sinulla", "sinulle", "sinulta", "sinun", "sinussa", "sinusta", "sinut", "sinuun", "sinä", "sisäkkäin", "sisällä", "siten", "sitten", "sitä", "ssa", "sta", "suoraan", "suuntaan", "suuren", "suuret", "suuri", "suuria", "suurin", "suurten", "taa", "taas", "taemmas", "tahansa", "tai", "takaa", "takaisin", "takana", "takia", "tallä", "tapauksessa", "tarpeeksi", "tavalla", "tavoitteena", "te", "teidän", "teidät", "teihin", "teille", "teillä", "teiltä", "teissä", "teistä", "teitä", "tietysti", "todella", "toinen", "toisaalla", "toisaalle", "toisaalta", "toiseen", "toiseksi", "toisella", "toiselle", "toiselta", "toisemme", "toisen", "toisensa", "toisessa", "toisesta", "toista", "toistaiseksi", "toki", "tosin", "tuhannen", "tuhat", "tule", "tulee", "tulemme", "tulen", "tulet", "tulette", "tulevat", "tulimme", "tulin", "tulisi", "tulisimme", "tulisin", "tulisit", "tulisitte", "tulisivat", "tulit", "tulitte", "tulivat", "tulla", "tulleet", "tullut", "tuntuu", "tuo", "tuohon", "tuoksi", "tuolla", "tuolle", "tuolloin", "tuolta", "tuon", "tuona", "tuonne", "tuossa", "tuosta", "tuota", "tuotä", "tuskin", "tykö", "tähän", "täksi", "tälle", "tällä", "tällöin", "tältä", "tämä", "tämän", "tänne", "tänä", "tänään", "tässä", "tästä", "täten", "tätä", "täysin", "täytyvät", "täytyy", "täällä", "täältä", "ulkopuolella", "usea", "useasti", "useimmiten", "usein", "useita", "uudeksi", "uudelleen", "uuden", "uudet", "uusi", "uusia", "uusien", "uusinta", "uuteen", "uutta", "vaan", "vahemmän", "vai", "vaiheessa", "vaikea", 
"vaikean", "vaikeat", "vaikeilla", "vaikeille", "vaikeilta", "vaikeissa", "vaikeista", "vaikka", "vain", "varmasti", "varsin", "varsinkin", "varten", "vasen", "vasenmalla", "vasta", "vastaan", "vastakkain", "vastan", "verran", "vielä", "vierekkäin", "vieressä", "vieri", "viiden", "viime", "viimeinen", "viimeisen", "viimeksi", "viisi", "voi", "voidaan", "voimme", "voin", "voisi", "voit", "voitte", "voivat", "vuoden", "vuoksi", "vuosi", "vuosien", "vuosina", "vuotta", "vähemmän", "vähintään", "vähiten", "vähän", "välillä", "yhdeksän", "yhden", "yhdessä", "yhteen", "yhteensä", "yhteydessä", "yhteyteen", "yhtä", "yhtäälle", "yhtäällä", "yhtäältä", "yhtään", "yhä", "yksi", "yksin", "yksittäin", "yleensä", "ylemmäs", "yli", "ylös", "ympäri", "älköön", "älä" )) FRENCH = frozenset(( "a", "abord", "absolument", "afin", "ah", "ai", "aie", "aient", "aies", "ailleurs", "ainsi", "ait", "allaient", "allo", "allons", "allô", "alors", "anterieur", "anterieure", "anterieures", "apres", "après", "as", "assez", "attendu", "au", "aucun", "aucune", "aucuns", "aujourd", "aujourd'hui", "aupres", "auquel", "aura", "aurai", "auraient", "aurais", "aurait", "auras", "aurez", "auriez", "aurions", "aurons", "auront", "aussi", "autant", "autre", "autrefois", "autrement", "autres", "autrui", "aux", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avez", "aviez", "avions", "avoir", "avons", "ayant", "ayez", "ayons", "b", "bah", "bas", "basee", "bat", "beau", "beaucoup", "bien", "bigre", "bon", "boum", "bravo", "brrr", "c", "car", "ce", "ceci", "cela", "celle", "celle-ci", "celle-là", "celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "celà", "cent", "cependant", "certain", "certaine", "certaines", "certains", "certes", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "chacun", "chacune", "chaque", "cher", "chers", "chez", "chiche", "chut", "chère", "chères", "ci", "cinq", "cinquantaine", "cinquante", "cinquantième", "cinquième", "clac", "clic", 
"combien", "comme", "comment", "comparable", "comparables", "compris", "concernant", "contre", "couic", "crac", "d", "da", "dans", "de", "debout", "dedans", "dehors", "deja", "delà", "depuis", "dernier", "derniere", "derriere", "derrière", "des", "desormais", "desquelles", "desquels", "dessous", "dessus", "deux", "deuxième", "deuxièmement", "devant", "devers", "devra", "devrait", "different", "differentes", "differents", "différent", "différente", "différentes", "différents", "dire", "directe", "directement", "dit", "dite", "dits", "divers", "diverse", "diverses", "dix", "dix-huit", "dix-neuf", "dix-sept", "dixième", "doit", "doivent", "donc", "dont", "dos", "douze", "douzième", "dring", "droite", "du", "duquel", "durant", "dès", "début", "désormais", "e", "effet", "egale", "egalement", "egales", "eh", "elle", "elle-même", "elles", "elles-mêmes", "en", "encore", "enfin", "entre", "envers", "environ", "es", "essai", "est", "et", "etant", "etc", "etre", "eu", "eue", "eues", "euh", "eurent", "eus", "eusse", "eussent", "eusses", "eussiez", "eussions", "eut", "eux", "eux-mêmes", "exactement", "excepté", "extenso", "exterieur", "eûmes", "eût", "eûtes", "f", "fais", "faisaient", "faisant", "fait", "faites", "façon", "feront", "fi", "flac", "floc", "fois", "font", "force", "furent", "fus", "fusse", "fussent", "fusses", "fussiez", "fussions", "fut", "fûmes", "fût", "fûtes", "g", "gens", "h", "ha", "haut", "hein", "hem", "hep", "hi", "ho", "holà", "hop", "hormis", "hors", "hou", "houp", "hue", "hui", "huit", "huitième", "hum", "hurrah", "hé", "hélas", "i", "ici", "il", "ils", "importe", "j", "je", "jusqu", "jusque", "juste", "k", "l", "la", "laisser", "laquelle", "las", "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "longtemps", "lors", "lorsque", "lui", "lui-meme", "lui-même", "là", "lès", "m", "ma", "maint", "maintenant", "mais", "malgre", "malgré", "maximale", "me", "meme", "memes", "merci", "mes", "mien", "mienne", "miennes", "miens", "mille", "mince", 
"mine", "minimale", "moi", "moi-meme", "moi-même", "moindres", "moins", "mon", "mot", "moyennant", "multiple", "multiples", "même", "mêmes", "n", "na", "naturel", "naturelle", "naturelles", "ne", "neanmoins", "necessaire", "necessairement", "neuf", "neuvième", "ni", "nombreuses", "nombreux", "nommés", "non", "nos", "notamment", "notre", "nous", "nous-mêmes", "nouveau", "nouveaux", "nul", "néanmoins", "nôtre", "nôtres", "o", "oh", "ohé", "ollé", "olé", "on", "ont", "onze", "onzième", "ore", "ou", "ouf", "ouias", "oust", "ouste", "outre", "ouvert", "ouverte", "ouverts", "o|", "où", "p", "paf", "pan", "par", "parce", "parfois", "parle", "parlent", "parler", "parmi", "parole", "parseme", "partant", "particulier", "particulière", "particulièrement", "pas", "passé", "pendant", "pense", "permet", "personne", "personnes", "peu", "peut", "peuvent", "peux", "pff", "pfft", "pfut", "pif", "pire", "pièce", "plein", "plouf", "plupart", "plus", "plusieurs", "plutôt", "possessif", "possessifs", "possible", "possibles", "pouah", "pour", "pourquoi", "pourrais", "pourrait", "pouvait", "prealable", "precisement", "premier", "première", "premièrement", "pres", "probable", "probante", "procedant", "proche", "près", "psitt", "pu", "puis", "puisque", "pur", "pure", "q", "qu", "quand", "quant", "quant-à-soi", "quanta", "quarante", "quatorze", "quatre", "quatre-vingt", "quatrième", "quatrièmement", "que", "quel", "quelconque", "quelle", "quelles", "quelqu'un", "quelque", "quelques", "quels", "qui", "quiconque", "quinze", "quoi", "quoique", "r", "rare", "rarement", "rares", "relative", "relativement", "remarquable", "rend", "rendre", "restant", "reste", "restent", "restrictif", "retour", "revoici", "revoilà", "rien", "s", "sa", "sacrebleu", "sait", "sans", "sapristi", "sauf", "se", "sein", "seize", "selon", "semblable", "semblaient", "semble", "semblent", "sent", "sept", "septième", "sera", "serai", "seraient", "serais", "serait", "seras", "serez", "seriez", "serions", "serons", "seront", 
"ses", "seul", "seule", "seulement", "si", "sien", "sienne", "siennes", "siens", "sinon", "six", "sixième", "soi", "soi-même", "soient", "sois", "soit", "soixante", "sommes", "son", "sont", "sous", "souvent", "soyez", "soyons", "specifique", "specifiques", "speculatif", "stop", "strictement", "subtiles", "suffisant", "suffisante", "suffit", "suis", "suit", "suivant", "suivante", "suivantes", "suivants", "suivre", "sujet", "superpose", "sur", "surtout", "t", "ta", "tac", "tandis", "tant", "tardive", "te", "tel", "telle", "tellement", "telles", "tels", "tenant", "tend", "tenir", "tente", "tes", "tic", "tien", "tienne", "tiennes", "tiens", "toc", "toi", "toi-même", "ton", "touchant", "toujours", "tous", "tout", "toute", "toutefois", "toutes", "treize", "trente", "tres", "trois", "troisième", "troisièmement", "trop", "très", "tsoin", "tsouin", "tu", "té", "u", "un", "une", "unes", "uniformement", "unique", "uniques", "uns", "v", "va", "vais", "valeur", "vas", "vers", "via", "vif", "vifs", "vingt", "vivat", "vive", "vives", "vlan", "voici", "voie", "voient", "voilà", "voire", "vont", "vos", "votre", "vous", "vous-mêmes", "vu", "vé", "vôtre", "vôtres", "w", "x", "y", "z", "zut", "à", "â", "ça", "ès", "étaient", "étais", "était", "étant", "état", "étiez", "étions", "été", "étée", "étées", "étés", "êtes", "être", "ô" )) GERMAN = frozenset(( "a", "ab", "aber", "ach", "acht", "achte", "achten", "achter", "achtes", "ag", "alle", "allein", "allem", "allen", "aller", "allerdings", "alles", "allgemeinen", "als", "also", "am", "an", "ander", "andere", "anderem", "anderen", "anderer", "anderes", "anderm", "andern", "anderr", "anders", "au", "auch", "auf", "aus", "ausser", "ausserdem", "außer", "außerdem", "b", "bald", "bei", "beide", "beiden", "beim", "beispiel", "bekannt", "bereits", "besonders", "besser", "besten", "bin", "bis", "bisher", "bist", "c", "d", "d.h", "da", "dabei", "dadurch", "dafür", "dagegen", "daher", "dahin", "dahinter", "damals", "damit", "danach", "daneben", 
"dank", "dann", "daran", "darauf", "daraus", "darf", "darfst", "darin", "darum", "darunter", "darüber", "das", "dasein", "daselbst", "dass", "dasselbe", "davon", "davor", "dazu", "dazwischen", "daß", "dein", "deine", "deinem", "deinen", "deiner", "deines", "dem", "dementsprechend", "demgegenüber", "demgemäss", "demgemäß", "demselben", "demzufolge", "den", "denen", "denn", "denselben", "der", "deren", "derer", "derjenige", "derjenigen", "dermassen", "dermaßen", "derselbe", "derselben", "des", "deshalb", "desselben", "dessen", "deswegen", "dich", "die", "diejenige", "diejenigen", "dies", "diese", "dieselbe", "dieselben", "diesem", "diesen", "dieser", "dieses", "dir", "doch", "dort", "drei", "drin", "dritte", "dritten", "dritter", "drittes", "du", "durch", "durchaus", "durfte", "durften", "dürfen", "dürft", "e", "eben", "ebenso", "ehrlich", "ei", "ei,", "eigen", "eigene", "eigenen", "eigener", "eigenes", "ein", "einander", "eine", "einem", "einen", "einer", "eines", "einig", "einige", "einigem", "einigen", "einiger", "einiges", "einmal", "eins", "elf", "en", "ende", "endlich", "entweder", "er", "ernst", "erst", "erste", "ersten", "erster", "erstes", "es", "etwa", "etwas", "euch", "euer", "eure", "eurem", "euren", "eurer", "eures", "f", "folgende", "früher", "fünf", "fünfte", "fünften", "fünfter", "fünftes", "für", "g", "gab", "ganz", "ganze", "ganzen", "ganzer", "ganzes", "gar", "gedurft", "gegen", "gegenüber", "gehabt", "gehen", "geht", "gekannt", "gekonnt", "gemacht", "gemocht", "gemusst", "genug", "gerade", "gern", "gesagt", "geschweige", "gewesen", "gewollt", "geworden", "gibt", "ging", "gleich", "gott", "gross", "grosse", "grossen", "grosser", "grosses", "groß", "große", "großen", "großer", "großes", "gut", "gute", "guter", "gutes", "h", "hab", "habe", "haben", "habt", "hast", "hat", "hatte", "hatten", "hattest", "hattet", "heisst", "her", "heute", "hier", "hin", "hinter", "hoch", "hätte", "hätten", "i", "ich", "ihm", "ihn", "ihnen", "ihr", "ihre", "ihrem", 
"ihren", "ihrer", "ihres", "im", "immer", "in", "indem", "infolgedessen", "ins", "irgend", "ist", "j", "ja", "jahr", "jahre", "jahren", "je", "jede", "jedem", "jeden", "jeder", "jedermann", "jedermanns", "jedes", "jedoch", "jemand", "jemandem", "jemanden", "jene", "jenem", "jenen", "jener", "jenes", "jetzt", "k", "kam", "kann", "kannst", "kaum", "kein", "keine", "keinem", "keinen", "keiner", "keines", "kleine", "kleinen", "kleiner", "kleines", "kommen", "kommt", "konnte", "konnten", "kurz", "können", "könnt", "könnte", "l", "lang", "lange", "leicht", "leide", "lieber", "los", "m", "machen", "macht", "machte", "mag", "magst", "mahn", "mal", "man", "manche", "manchem", "manchen", "mancher", "manches", "mann", "mehr", "mein", "meine", "meinem", "meinen", "meiner", "meines", "mensch", "menschen", "mich", "mir", "mit", "mittel", "mochte", "mochten", "morgen", "muss", "musst", "musste", "mussten", "muß", "mußt", "möchte", "mögen", "möglich", "mögt", "müssen", "müsst", "müßt", "n", "na", "nach", "nachdem", "nahm", "natürlich", "neben", "nein", "neue", "neuen", "neun", "neunte", "neunten", "neunter", "neuntes", "nicht", "nichts", "nie", "niemand", "niemandem", "niemanden", "noch", "nun", "nur", "o", "ob", "oben", "oder", "offen", "oft", "ohne", "ordnung", "p", "q", "r", "recht", "rechte", "rechten", "rechter", "rechtes", "richtig", "rund", "s", "sa", "sache", "sagt", "sagte", "sah", "satt", "schlecht", "schluss", "schon", "sechs", "sechste", "sechsten", "sechster", "sechstes", "sehr", "sei", "seid", "seien", "sein", "seine", "seinem", "seinen", "seiner", "seines", "seit", "seitdem", "selbst", "sich", "sie", "sieben", "siebente", "siebenten", "siebenter", "siebentes", "sind", "so", "solang", "solche", "solchem", "solchen", "solcher", "solches", "soll", "sollen", "sollst", "sollt", "sollte", "sollten", "sondern", "sonst", "soweit", "sowie", "später", "startseite", "statt", "steht", "suche", "t", "tag", "tage", "tagen", "tat", "teil", "tel", "tritt", "trotzdem", "tun", "u", 
"uhr", "um", "und", "uns", "unse", "unsem", "unsen", "unser", "unsere", "unserer", "unses", "unter", "v", "vergangenen", "viel", "viele", "vielem", "vielen", "vielleicht", "vier", "vierte", "vierten", "vierter", "viertes", "vom", "von", "vor", "w", "wahr", "wann", "war", "waren", "warst", "wart", "warum", "was", "weg", "wegen", "weil", "weit", "weiter", "weitere", "weiteren", "weiteres", "welche", "welchem", "welchen", "welcher", "welches", "wem", "wen", "wenig", "wenige", "weniger", "weniges", "wenigstens", "wenn", "wer", "werde", "werden", "werdet", "weshalb", "wessen", "wie", "wieder", "wieso", "will", "willst", "wir", "wird", "wirklich", "wirst", "wissen", "wo", "woher", "wohin", "wohl", "wollen", "wollt", "wollte", "wollten", "worden", "wurde", "wurden", "während", "währenddem", "währenddessen", "wäre", "würde", "würden", "x", "y", "z", "z.b", "zehn", "zehnte", "zehnten", "zehnter", "zehntes", "zeit", "zu", "zuerst", "zugleich", "zum", "zunächst", "zur", "zurück", "zusammen", "zwanzig", "zwar", "zwei", "zweite", "zweiten", "zweiter", "zweites", "zwischen", "zwölf", "über", "überhaupt", "übrigens" )) GREEK = frozenset(( "ένα", "έναν", "ένας", "αι", "ακομα", "ακομη", "ακριβως", "αληθεια", "αληθινα", "αλλα", "αλλαχου", "αλλες", "αλλη", "αλλην", "αλλης", "αλλιως", "αλλιωτικα", "αλλο", "αλλοι", "αλλοιως", "αλλοιωτικα", "αλλον", "αλλος", "αλλοτε", "αλλου", "αλλους", "αλλων", "αμα", "αμεσα", "αμεσως", "αν", "ανα", "αναμεσα", "αναμεταξυ", "ανευ", "αντι", "αντιπερα", "αντις", "ανω", "ανωτερω", "αξαφνα", "απ", "απεναντι", "απο", "αποψε", "από", "αρα", "αραγε", "αργα", "αργοτερο", "αριστερα", "αρκετα", "αρχικα", "ας", "αυριο", "αυτα", "αυτες", "αυτεσ", "αυτη", "αυτην", "αυτης", "αυτο", "αυτοι", "αυτον", "αυτος", "αυτοσ", "αυτου", "αυτους", "αυτουσ", "αυτων", "αφοτου", "αφου", "αἱ", "αἳ", "αἵ", "αὐτόσ", "αὐτὸς", "αὖ", "α∆ιακοπα", "βεβαια", "βεβαιοτατα", "γάρ", "γα", "γα^", "γε", "γι", "για", "γοῦν", "γρηγορα", "γυρω", "γὰρ", "δ'", "δέ", "δή", "δαί", "δαίσ", "δαὶ", "δαὶς", 
"δε", "δεν", "δι", "δι'", "διά", "δια", "διὰ", "δὲ", "δὴ", "δ’", "εαν", "εαυτο", "εαυτον", "εαυτου", "εαυτους", "εαυτων", "εγκαιρα", "εγκαιρως", "εγω", "ειθε", "ειμαι", "ειμαστε", "ειναι", "εις", "εισαι", "εισαστε", "ειστε", "ειτε", "ειχα", "ειχαμε", "ειχαν", "ειχατε", "ειχε", "ειχες", "ει∆εμη", "εκ", "εκαστα", "εκαστες", "εκαστη", "εκαστην", "εκαστης", "εκαστο", "εκαστοι", "εκαστον", "εκαστος", "εκαστου", "εκαστους", "εκαστων", "εκει", "εκεινα", "εκεινες", "εκεινεσ", "εκεινη", "εκεινην", "εκεινης", "εκεινο", "εκεινοι", "εκεινον", "εκεινος", "εκεινοσ", "εκεινου", "εκεινους", "εκεινουσ", "εκεινων", "εκτος", "εμας", "εμεις", "εμενα", "εμπρος", "εν", "ενα", "εναν", "ενας", "ενος", "εντελως", "εντος", "εντωμεταξυ", "ενω", "ενός", "εξ", "εξαφνα", "εξης", "εξισου", "εξω", "επ", "επί", "επανω", "επειτα", "επει∆η", "επι", "επισης", "επομενως", "εσας", "εσεις", "εσενα", "εστω", "εσυ", "ετερα", "ετεραι", "ετερας", "ετερες", "ετερη", "ετερης", "ετερο", "ετεροι", "ετερον", "ετερος", "ετερου", "ετερους", "ετερων", "ετουτα", "ετουτες", "ετουτη", "ετουτην", "ετουτης", "ετουτο", "ετουτοι", "ετουτον", "ετουτος", "ετουτου", "ετουτους", "ετουτων", "ετσι", "ευγε", "ευθυς", "ευτυχως", "εφεξης", "εχει", "εχεις", "εχετε", "εχθες", "εχομε", "εχουμε", "εχουν", "εχτες", "εχω", "εως", "εἰ", "εἰμί", "εἰμὶ", "εἰς", "εἰσ", "εἴ", "εἴμι", "εἴτε", "ε∆ω", "η", "ημασταν", "ημαστε", "ημουν", "ησασταν", "ησαστε", "ησουν", "ηταν", "ητανε", "ητοι", "ηττον", "η∆η", "θα", "ι", "ιι", "ιιι", "ισαμε", "ισια", "ισως", "ισωσ", "ι∆ια", "ι∆ιαν", "ι∆ιας", "ι∆ιες", "ι∆ιο", "ι∆ιοι", "ι∆ιον", "ι∆ιος", "ι∆ιου", "ι∆ιους", "ι∆ιων", "ι∆ιως", "κ", "καί", "καίτοι", "καθ", "καθε", "καθεμια", "καθεμιας", "καθενα", "καθενας", "καθενος", "καθετι", "καθολου", "καθως", "και", "κακα", "κακως", "καλα", "καλως", "καμια", "καμιαν", "καμιας", "καμποσα", "καμποσες", "καμποση", "καμποσην", "καμποσης", "καμποσο", "καμποσοι", "καμποσον", "καμποσος", "καμποσου", "καμποσους", "καμποσων", "κανεις", "κανεν", "κανενα", "κανεναν", "κανενας", 
"κανενος", "καποια", "καποιαν", "καποιας", "καποιες", "καποιο", "καποιοι", "καποιον", "καποιος", "καποιου", "καποιους", "καποιων", "καποτε", "καπου", "καπως", "κατ", "κατά", "κατα", "κατι", "κατιτι", "κατοπιν", "κατω", "κατὰ", "καὶ", "κι", "κιολας", "κλπ", "κοντα", "κτλ", "κυριως", "κἀν", "κἂν", "λιγακι", "λιγο", "λιγωτερο", "λογω", "λοιπα", "λοιπον", "μέν", "μέσα", "μή", "μήτε", "μία", "μα", "μαζι", "μακαρι", "μακρυα", "μαλιστα", "μαλλον", "μας", "με", "μεθ", "μεθαυριο", "μειον", "μελει", "μελλεται", "μεμιας", "μεν", "μερικα", "μερικες", "μερικοι", "μερικους", "μερικων", "μεσα", "μετ", "μετά", "μετα", "μεταξυ", "μετὰ", "μεχρι", "μη", "μην", "μηπως", "μητε", "μη∆ε", "μιά", "μια", "μιαν", "μιας", "μολις", "μολονοτι", "μοναχα", "μονες", "μονη", "μονην", "μονης", "μονο", "μονοι", "μονομιας", "μονος", "μονου", "μονους", "μονων", "μου", "μπορει", "μπορουν", "μπραβο", "μπρος", "μἐν", "μὲν", "μὴ", "μὴν", "να", "ναι", "νωρις", "ξανα", "ξαφνικα", "ο", "οι", "ολα", "ολες", "ολη", "ολην", "ολης", "ολο", "ολογυρα", "ολοι", "ολον", "ολονεν", "ολος", "ολοτελα", "ολου", "ολους", "ολων", "ολως", "ολως∆ιολου", "ομως", "ομωσ", "οποια", "οποιαν", "οποιαν∆ηποτε", "οποιας", "οποιας∆ηποτε", "οποια∆ηποτε", "οποιες", "οποιες∆ηποτε", "οποιο", "οποιοι", "οποιον", "οποιον∆ηποτε", "οποιος", "οποιος∆ηποτε", "οποιου", "οποιους", "οποιους∆ηποτε", "οποιου∆ηποτε", "οποιο∆ηποτε", "οποιων", "οποιων∆ηποτε", "οποι∆ηποτε", "οποτε", "οποτε∆ηποτε", "οπου", "οπου∆ηποτε", "οπως", "οπωσ", "ορισμενα", "ορισμενες", "ορισμενων", "ορισμενως", "οσα", "οσα∆ηποτε", "οσες", "οσες∆ηποτε", "οση", "οσην", "οσην∆ηποτε", "οσης", "οσης∆ηποτε", "οση∆ηποτε", "οσο", "οσοι", "οσοι∆ηποτε", "οσον", "οσον∆ηποτε", "οσος", "οσος∆ηποτε", "οσου", "οσους", "οσους∆ηποτε", "οσου∆ηποτε", "οσο∆ηποτε", "οσων", "οσων∆ηποτε", "οταν", "οτι", "οτι∆ηποτε", "οτου", "ου", "ουτε", "ου∆ε", "οχι", "οἱ", "οἳ", "οἷς", "οὐ", "οὐδ", "οὐδέ", "οὐδείσ", "οὐδεὶς", "οὐδὲ", "οὐδὲν", "οὐκ", "οὐχ", "οὐχὶ", "οὓς", "οὔτε", "οὕτω", "οὕτως", "οὕτωσ", "οὖν", "οὗ", 
"οὗτος", "οὗτοσ", "παλι", "παντοτε", "παντου", "παντως", "παρ", "παρά", "παρα", "παρὰ", "περί", "περα", "περι", "περιπου", "περισσοτερο", "περσι", "περυσι", "περὶ", "πια", "πιθανον", "πιο", "πισω", "πλαι", "πλεον", "πλην", "ποια", "ποιαν", "ποιας", "ποιες", "ποιεσ", "ποιο", "ποιοι", "ποιον", "ποιος", "ποιοσ", "ποιου", "ποιους", "ποιουσ", "ποιων", "πολυ", "ποσες", "ποση", "ποσην", "ποσης", "ποσοι", "ποσος", "ποσους", "ποτε", "που", "πουθε", "πουθενα", "ποῦ", "πρεπει", "πριν", "προ", "προκειμενου", "προκειται", "προπερσι", "προς", "προσ", "προτου", "προχθες", "προχτες", "πρωτυτερα", "πρόσ", "πρὸ", "πρὸς", "πως", "πωσ", "σαν", "σας", "σε", "σεις", "σημερα", "σιγα", "σου", "στα", "στη", "στην", "στης", "στις", "στο", "στον", "στου", "στους", "στων", "συγχρονως", "συν", "συναμα", "συνεπως", "συνηθως", "συχνα", "συχνας", "συχνες", "συχνη", "συχνην", "συχνης", "συχνο", "συχνοι", "συχνον", "συχνος", "συχνου", "συχνους", "συχνων", "συχνως", "σχε∆ον", "σωστα", "σόσ", "σύ", "σύν", "σὸς", "σὺ", "σὺν", "τά", "τήν", "τί", "τίς", "τίσ", "τα", "ταυτα", "ταυτες", "ταυτη", "ταυτην", "ταυτης", "ταυτο,ταυτον", "ταυτος", "ταυτου", "ταυτων", "ταχα", "ταχατε", "ταῖς", "τα∆ε", "τε", "τελικα", "τελικως", "τες", "τετοια", "τετοιαν", "τετοιας", "τετοιες", "τετοιο", "τετοιοι", "τετοιον", "τετοιος", "τετοιου", "τετοιους", "τετοιων", "τη", "την", "της", "τησ", "τι", "τινα", "τιποτα", "τιποτε", "τις", "τισ", "το", "τοί", "τοι", "τοιοῦτος", "τοιοῦτοσ", "τον", "τος", "τοσα", "τοσες", "τοση", "τοσην", "τοσης", "τοσο", "τοσοι", "τοσον", "τοσος", "τοσου", "τοσους", "τοσων", "τοτε", "του", "τουλαχιστο", "τουλαχιστον", "τους", "τουτα", "τουτες", "τουτη", "τουτην", "τουτης", "τουτο", "τουτοι", "τουτοις", "τουτον", "τουτος", "τουτου", "τουτους", "τουτων", "τούσ", "τοὺς", "τοῖς", "τοῦ", "τυχον", "των", "τωρα", "τό", "τόν", "τότε", "τὰ", "τὰς", "τὴν", "τὸ", "τὸν", "τῆς", "τῆσ", "τῇ", "τῶν", "τῷ", "υπ", "υπερ", "υπο", "υποψη", "υποψιν", "υπό", "υστερα", "φετος", "χαμηλα", "χθες", "χτες", "χωρις", "χωριστα", 
"ψηλα", "ω", "ωραια", "ως", "ωσ", "ωσαν", "ωσοτου", "ωσπου", "ωστε", "ωστοσο", "ωχ", "ἀλλ'", "ἀλλά", "ἀλλὰ", "ἀλλ’", "ἀπ", "ἀπό", "ἀπὸ", "ἀφ", "ἂν", "ἃ", "ἄλλος", "ἄλλοσ", "ἄν", "ἄρα", "ἅμα", "ἐάν", "ἐγώ", "ἐγὼ", "ἐκ", "ἐμόσ", "ἐμὸς", "ἐν", "ἐξ", "ἐπί", "ἐπεὶ", "ἐπὶ", "ἐστι", "ἐφ", "ἐὰν", "ἑαυτοῦ", "ἔτι", "ἡ", "ἢ", "ἣ", "ἤ", "ἥ", "ἧς", "ἵνα", "ὁ", "ὃ", "ὃν", "ὃς", "ὅ", "ὅδε", "ὅθεν", "ὅπερ", "ὅς", "ὅσ", "ὅστις", "ὅστισ", "ὅτε", "ὅτι", "ὑμόσ", "ὑπ", "ὑπέρ", "ὑπό", "ὑπὲρ", "ὑπὸ", "ὡς", "ὡσ", "ὥς", "ὥστε", "ὦ", "ᾧ", "∆α", "∆ε", "∆εινα", "∆εν", "∆εξια", "∆ηθεν", "∆ηλα∆η", "∆ι", "∆ια", "∆ιαρκως", "∆ικα", "∆ικο", "∆ικοι", "∆ικος", "∆ικου", "∆ικους", "∆ιολου", "∆ιπλα", "∆ιχως" )) HINDI = frozenset(( "अंदर", "अत", "अदि", "अप", "अपना", "अपनि", "अपनी", "अपने", "अभि", "अभी", "आदि", "आप", "इंहिं", "इंहें", "इंहों", "इतयादि", "इत्यादि", "इन", "इनका", "इन्हीं", "इन्हें", "इन्हों", "इस", "इसका", "इसकि", "इसकी", "इसके", "इसमें", "इसि", "इसी", "इसे", "उंहिं", "उंहें", "उंहों", "उन", "उनका", "उनकि", "उनकी", "उनके", "उनको", "उन्हीं", "उन्हें", "उन्हों", "उस", "उसके", "उसि", "उसी", "उसे", "एक", "एवं", "एस", "एसे", "ऐसे", "ओर", "और", "कइ", "कई", "कर", "करता", "करते", "करना", "करने", "करें", "कहते", "कहा", "का", "काफि", "काफ़ी", "कि", "किंहें", "किंहों", "कितना", "किन्हें", "किन्हों", "किया", "किर", "किस", "किसि", "किसी", "किसे", "की", "कुछ", "कुल", "के", "को", "कोइ", "कोई", "कोन", "कोनसा", "कौन", "कौनसा", "गया", "घर", "जब", "जहाँ", "जहां", "जा", "जिंहें", "जिंहों", "जितना", "जिधर", "जिन", "जिन्हें", "जिन्हों", "जिस", "जिसे", "जीधर", "जेसा", "जेसे", "जैसा", "जैसे", "जो", "तक", "तब", "तरह", "तिंहें", "तिंहों", "तिन", "तिन्हें", "तिन्हों", "तिस", "तिसे", "तो", "था", "थि", "थी", "थे", "दबारा", "दवारा", "दिया", "दुसरा", "दुसरे", "दूसरे", "दो", "द्वारा", "न", "नहिं", "नहीं", "ना", "निचे", "निहायत", "नीचे", "ने", "पर", "पहले", "पुरा", "पूरा", "पे", "फिर", "बनि", "बनी", "बहि", "बही", "बहुत", "बाद", "बाला", "बिलकुल", "भि", "भितर", "भी", "भीतर", "मगर", "मानो", "मे", "में", "यदि", "यह", "यहाँ", 
"यहां", "यहि", "यही", "या", "यिह", "ये", "रखें", "रवासा", "रहा", "रहे", "ऱ्वासा", "लिए", "लिये", "लेकिन", "व", "वगेरह", "वरग", "वर्ग", "वह", "वहाँ", "वहां", "वहिं", "वहीं", "वाले", "वुह", "वे", "वग़ैरह", "संग", "सकता", "सकते", "सबसे", "सभि", "सभी", "साथ", "साबुत", "साभ", "सारा", "से", "सो", "हि", "ही", "हुअ", "हुआ", "हुइ", "हुई", "हुए", "हे", "हें", "है", "हैं", "हो", "होता", "होति", "होती", "होते", "होना", "होने" )) HUNGARIAN = frozenset(( "a", "abba", "abban", "abból", "addig", "ahhoz", "ahogy", "ahol", "aki", "akik", "akkor", "akár", "alapján", "alatt", "alatta", "alattad", "alattam", "alattatok", "alattuk", "alattunk", "alá", "alád", "alájuk", "alám", "alánk", "alátok", "alól", "alóla", "alólad", "alólam", "alólatok", "alóluk", "alólunk", "amely", "amelybol", "amelyek", "amelyekben", "amelyeket", "amelyet", "amelyik", "amelynek", "ami", "amikor", "amit", "amolyan", "amott", "amíg", "annak", "annál", "arra", "arról", "attól", "az", "aznap", "azok", "azokat", "azokba", "azokban", "azokból", "azokhoz", "azokig", "azokkal", "azokká", "azoknak", "azoknál", "azokon", "azokra", "azokról", "azoktól", "azokért", "azon", "azonban", "azonnal", "azt", "aztán", "azután", "azzal", "azzá", "azért", "bal", "balra", "ban", "be", "belé", "beléd", "beléjük", "belém", "belénk", "belétek", "belül", "belőle", "belőled", "belőlem", "belőletek", "belőlük", "belőlünk", "ben", "benne", "benned", "bennem", "bennetek", "bennük", "bennünk", "bár", "bárcsak", "bármilyen", "búcsú", "cikk", "cikkek", "cikkeket", "csak", "csakhogy", "csupán", "de", "dehogy", "e", "ebbe", "ebben", "ebből", "eddig", "egy", "egyebek", "egyebet", "egyedül", "egyelőre", "egyes", "egyet", "egyetlen", "egyik", "egymás", "egyre", "egyszerre", "egyéb", "együtt", "egész", "egészen", "ehhez", "ekkor", "el", "eleinte", "ellen", "ellenes", "elleni", "ellenére", "elmondta", "elsõ", "első", "elsők", "elsősorban", "elsőt", "elé", "eléd", "elég", "eléjük", "elém", "elénk", "elétek", "elõ", "elõször", "elõtt", "elő", "előbb", 
"elől", "előle", "előled", "előlem", "előletek", "előlük", "előlünk", "először", "előtt", "előtte", "előtted", "előttem", "előttetek", "előttük", "előttünk", "előző", "emilyen", "engem", "ennek", "ennyi", "ennél", "enyém", "erre", "erről", "esetben", "ettől", "ez", "ezek", "ezekbe", "ezekben", "ezekből", "ezeken", "ezeket", "ezekhez", "ezekig", "ezekkel", "ezekké", "ezeknek", "ezeknél", "ezekre", "ezekről", "ezektől", "ezekért", "ezen", "ezentúl", "ezer", "ezret", "ezt", "ezután", "ezzel", "ezzé", "ezért", "fel", "fele", "felek", "felet", "felett", "felé", "fent", "fenti", "fél", "fölé", "gyakran", "ha", "halló", "hamar", "hanem", "harmadik", "harmadikat", "harminc", "hat", "hatodik", "hatodikat", "hatot", "hatvan", "helyett", "hetedik", "hetediket", "hetet", "hetven", "hirtelen", "hiszen", "hiába", "hogy", "hogyan", "hol", "holnap", "holnapot", "honnan", "hova", "hozzá", "hozzád", "hozzájuk", "hozzám", "hozzánk", "hozzátok", "hurrá", "huszadik", "hány", "hányszor", "hármat", "három", "hát", "hátha", "hátulsó", "hét", "húsz", "ide", "ide-оda", "idén", "igazán", "igen", "ill", "ill.", "illetve", "ilyen", "ilyenkor", "immár", "inkább", "is", "ismét", "ison", "itt", "jelenleg", "jobban", "jobbra", "jó", "jól", "jólesik", "jóval", "jövőre", "kell", "kellene", "kellett", "kelljen", "keressünk", "keresztül", "ketten", "kettő", "kettőt", "kevés", "ki", "kiben", "kiből", "kicsit", "kicsoda", "kihez", "kik", "kikbe", "kikben", "kikből", "kiken", "kiket", "kikhez", "kikkel", "kikké", "kiknek", "kiknél", "kikre", "kikről", "kiktől", "kikért", "kilenc", "kilencedik", "kilencediket", "kilencet", "kilencven", "kin", "kinek", "kinél", "kire", "kiről", "kit", "kitől", "kivel", "kivé", "kié", "kiért", "korábban", "képest", "kérem", "kérlek", "kész", "késő", "később", "későn", "két", "kétszer", "kívül", "körül", "köszönhetően", "köszönöm", "közben", "közel", "közepesen", "közepén", "közé", "között", "közül", "külön", "különben", "különböző", "különbözőbb", "különbözőek", "lassan", 
"le", "legalább", "legyen", "lehet", "lehetetlen", "lehetett", "lehetőleg", "lehetőség", "lenne", "lenni", "lennék", "lennének", "lesz", "leszek", "lesznek", "leszünk", "lett", "lettek", "lettem", "lettünk", "lévő", "ma", "maga", "magad", "magam", "magatokat", "magukat", "magunkat", "magát", "mai", "majd", "majdnem", "manapság", "meg", "megcsinál", "megcsinálnak", "megint", "megvan", "mellett", "mellette", "melletted", "mellettem", "mellettetek", "mellettük", "mellettünk", "mellé", "melléd", "melléjük", "mellém", "mellénk", "mellétek", "mellől", "mellőle", "mellőled", "mellőlem", "mellőletek", "mellőlük", "mellőlünk", "mely", "melyek", "melyik", "mennyi", "mert", "mi", "miatt", "miatta", "miattad", "miattam", "miattatok", "miattuk", "miattunk", "mibe", "miben", "miből", "mihez", "mik", "mikbe", "mikben", "mikből", "miken", "miket", "mikhez", "mikkel", "mikké", "miknek", "miknél", "mikor", "mikre", "mikről", "miktől", "mikért", "milyen", "min", "mind", "mindegyik", "mindegyiket", "minden", "mindenesetre", "mindenki", "mindent", "mindenütt", "mindig", "mindketten", "minek", "minket", "mint", "mintha", "minél", "mire", "miről", "mit", "mitől", "mivel", "mivé", "miért", "mondta", "most", "mostanáig", "már", "más", "másik", "másikat", "másnap", "második", "másodszor", "mások", "másokat", "mást", "még", "mégis", "míg", "mögé", "mögéd", "mögéjük", "mögém", "mögénk", "mögétek", "mögött", "mögötte", "mögötted", "mögöttem", "mögöttetek", "mögöttük", "mögöttünk", "mögül", "mögüle", "mögüled", "mögülem", "mögületek", "mögülük", "mögülünk", "múltkor", "múlva", "na", "nagy", "nagyobb", "nagyon", "naponta", "napot", "ne", "negyedik", "negyediket", "negyven", "neked", "nekem", "neki", "nekik", "nektek", "nekünk", "nem", "nemcsak", "nemrég", "nincs", "nyolc", "nyolcadik", "nyolcadikat", "nyolcat", "nyolcvan", "nála", "nálad", "nálam", "nálatok", "náluk", "nálunk", "négy", "négyet", "néha", "néhány", "nélkül", "o", "oda", "ok", "olyan", "onnan", "ott", "pedig", "persze", "pár", 
"például", "rajta", "rajtad", "rajtam", "rajtatok", "rajtuk", "rajtunk", "rendben", "rosszul", "rá", "rád", "rájuk", "rám", "ránk", "rátok", "régen", "régóta", "részére", "róla", "rólad", "rólam", "rólatok", "róluk", "rólunk", "rögtön", "s", "saját", "se", "sem", "semmi", "semmilyen", "semmiség", "senki", "soha", "sok", "sokan", "sokat", "sokkal", "sokszor", "sokáig", "során", "stb.", "szemben", "szerbusz", "szerint", "szerinte", "szerinted", "szerintem", "szerintetek", "szerintük", "szerintünk", "szervusz", "szinte", "számára", "száz", "századik", "százat", "szépen", "szét", "szíves", "szívesen", "szíveskedjék", "sőt", "talán", "tavaly", "te", "tegnap", "tegnapelőtt", "tehát", "tele", "teljes", "tessék", "ti", "tied", "titeket", "tizedik", "tizediket", "tizenegy", "tizenegyedik", "tizenhat", "tizenhárom", "tizenhét", "tizenkettedik", "tizenkettő", "tizenkilenc", "tizenkét", "tizennyolc", "tizennégy", "tizenöt", "tizet", "tovább", "további", "továbbá", "távol", "téged", "tényleg", "tíz", "több", "többi", "többször", "túl", "tőle", "tőled", "tőlem", "tőletek", "tőlük", "tőlünk", "ugyanakkor", "ugyanez", "ugyanis", "ugye", "urak", "uram", "urat", "utoljára", "utolsó", "után", "utána", "vagy", "vagyis", "vagyok", "vagytok", "vagyunk", "vajon", "valahol", "valaki", "valakit", "valamelyik", "valami", "valamint", "való", "van", "vannak", "vele", "veled", "velem", "veletek", "velük", "velünk", "vissza", "viszlát", "viszont", "viszontlátásra", "volna", "volnának", "volnék", "volt", "voltak", "voltam", "voltunk", "végre", "végén", "végül", "által", "általában", "ám", "át", "éljen", "én", "éppen", "érte", "érted", "értem", "értetek", "értük", "értünk", "és", "év", "évben", "éve", "évek", "éves", "évi", "évvel", "így", "óta", "õ", "õk", "õket", "ön", "önbe", "önben", "önből", "önhöz", "önnek", "önnel", "önnél", "önre", "önről", "önt", "öntől", "önért", "önök", "önökbe", "önökben", "önökből", "önöket", "önökhöz", "önökkel", "önöknek", "önöknél", "önökre", "önökről", "önöktől", 
"önökért", "önökön", "önön", "össze", "öt", "ötven", "ötödik", "ötödiket", "ötöt", "úgy", "úgyis", "úgynevezett", "új", "újabb", "újra", "úr", "ő", "ők", "őket", "őt" )) INDONESIAN = frozenset(( "ada", "adalah", "adanya", "adapun", "agak", "agaknya", "agar", "akan", "akankah", "akhir", "akhiri", "akhirnya", "aku", "akulah", "amat", "amatlah", "anda", "andalah", "antar", "antara", "antaranya", "apa", "apaan", "apabila", "apakah", "apalagi", "apatah", "artinya", "asal", "asalkan", "atas", "atau", "ataukah", "ataupun", "awal", "awalnya", "bagai", "bagaikan", "bagaimana", "bagaimanakah", "bagaimanapun", "bagi", "bagian", "bahkan", "bahwa", "bahwasanya", "baik", "bakal", "bakalan", "balik", "banyak", "bapak", "baru", "bawah", "beberapa", "begini", "beginian", "beginikah", "beginilah", "begitu", "begitukah", "begitulah", "begitupun", "bekerja", "belakang", "belakangan", "belum", "belumlah", "benar", "benarkah", "benarlah", "berada", "berakhir", "berakhirlah", "berakhirnya", "berapa", "berapakah", "berapalah", "berapapun", "berarti", "berawal", "berbagai", "berdatangan", "beri", "berikan", "berikut", "berikutnya", "berjumlah", "berkali-kali", "berkata", "berkehendak", "berkeinginan", "berkenaan", "berlainan", "berlalu", "berlangsung", "berlebihan", "bermacam", "bermacam-macam", "bermaksud", "bermula", "bersama", "bersama-sama", "bersiap", "bersiap-siap", "bertanya", "bertanya-tanya", "berturut", "berturut-turut", "bertutur", "berujar", "berupa", "besar", "betul", "betulkah", "biasa", "biasanya", "bila", "bilakah", "bisa", "bisakah", "boleh", "bolehkah", "bolehlah", "buat", "bukan", "bukankah", "bukanlah", "bukannya", "bulan", "bung", "cara", "caranya", "cukup", "cukupkah", "cukuplah", "cuma", "dahulu", "dalam", "dan", "dapat", "dari", "daripada", "datang", "dekat", "demi", "demikian", "demikianlah", "dengan", "depan", "di", "dia", "diakhiri", "diakhirinya", "dialah", "diantara", "diantaranya", "diberi", "diberikan", "diberikannya", "dibuat", "dibuatnya", "didapat", 
"didatangkan", "digunakan", "diibaratkan", "diibaratkannya", "diingat", "diingatkan", "diinginkan", "dijawab", "dijelaskan", "dijelaskannya", "dikarenakan", "dikatakan", "dikatakannya", "dikerjakan", "diketahui", "diketahuinya", "dikira", "dilakukan", "dilalui", "dilihat", "dimaksud", "dimaksudkan", "dimaksudkannya", "dimaksudnya", "diminta", "dimintai", "dimisalkan", "dimulai", "dimulailah", "dimulainya", "dimungkinkan", "dini", "dipastikan", "diperbuat", "diperbuatnya", "dipergunakan", "diperkirakan", "diperlihatkan", "diperlukan", "diperlukannya", "dipersoalkan", "dipertanyakan", "dipunyai", "diri", "dirinya", "disampaikan", "disebut", "disebutkan", "disebutkannya", "disini", "disinilah", "ditambahkan", "ditandaskan", "ditanya", "ditanyai", "ditanyakan", "ditegaskan", "ditujukan", "ditunjuk", "ditunjuki", "ditunjukkan", "ditunjukkannya", "ditunjuknya", "dituturkan", "dituturkannya", "diucapkan", "diucapkannya", "diungkapkan", "dong", "dua", "dulu", "empat", "enggak", "enggaknya", "entah", "entahlah", "guna", "gunakan", "hal", "hampir", "hanya", "hanyalah", "hari", "harus", "haruslah", "harusnya", "hendak", "hendaklah", "hendaknya", "hingga", "ia", "ialah", "ibarat", "ibaratkan", "ibaratnya", "ibu", "ikut", "ingat", "ingat-ingat", "ingin", "inginkah", "inginkan", "ini", "inikah", "inilah", "itu", "itukah", "itulah", "jadi", "jadilah", "jadinya", "jangan", "jangankan", "janganlah", "jauh", "jawab", "jawaban", "jawabnya", "jelas", "jelaskan", "jelaslah", "jelasnya", "jika", "jikalau", "juga", "jumlah", "jumlahnya", "justru", "kala", "kalau", "kalaulah", "kalaupun", "kalian", "kami", "kamilah", "kamu", "kamulah", "kan", "kapan", "kapankah", "kapanpun", "karena", "karenanya", "kasus", "kata", "katakan", "katakanlah", "katanya", "ke", "keadaan", "kebetulan", "kecil", "kedua", "keduanya", "keinginan", "kelamaan", "kelihatan", "kelihatannya", "kelima", "keluar", "kembali", "kemudian", "kemungkinan", "kemungkinannya", "kenapa", "kepada", "kepadanya", "kesampaian", 
"keseluruhan", "keseluruhannya", "keterlaluan", "ketika", "khususnya", "kini", "kinilah", "kira", "kira-kira", "kiranya", "kita", "kitalah", "kok", "kurang", "lagi", "lagian", "lah", "lain", "lainnya", "lalu", "lama", "lamanya", "lanjut", "lanjutnya", "lebih", "lewat", "lima", "luar", "macam", "maka", "makanya", "makin", "malah", "malahan", "mampu", "mampukah", "mana", "manakala", "manalagi", "masa", "masalah", "masalahnya", "masih", "masihkah", "masing", "masing-masing", "mau", "maupun", "melainkan", "melakukan", "melalui", "melihat", "melihatnya", "memang", "memastikan", "memberi", "memberikan", "membuat", "memerlukan", "memihak", "meminta", "memintakan", "memisalkan", "memperbuat", "mempergunakan", "memperkirakan", "memperlihatkan", "mempersiapkan", "mempersoalkan", "mempertanyakan", "mempunyai", "memulai", "memungkinkan", "menaiki", "menambahkan", "menandaskan", "menanti", "menanti-nanti", "menantikan", "menanya", "menanyai", "menanyakan", "mendapat", "mendapatkan", "mendatang", "mendatangi", "mendatangkan", "menegaskan", "mengakhiri", "mengapa", "mengatakan", "mengatakannya", "mengenai", "mengerjakan", "mengetahui", "menggunakan", "menghendaki", "mengibaratkan", "mengibaratkannya", "mengingat", "mengingatkan", "menginginkan", "mengira", "mengucapkan", "mengucapkannya", "mengungkapkan", "menjadi", "menjawab", "menjelaskan", "menuju", "menunjuk", "menunjuki", "menunjukkan", "menunjuknya", "menurut", "menuturkan", "menyampaikan", "menyangkut", "menyatakan", "menyebutkan", "menyeluruh", "menyiapkan", "merasa", "mereka", "merekalah", "merupakan", "meski", "meskipun", "meyakini", "meyakinkan", "minta", "mirip", "misal", "misalkan", "misalnya", "mula", "mulai", "mulailah", "mulanya", "mungkin", "mungkinkah", "nah", "naik", "namun", "nanti", "nantinya", "nyaris", "nyatanya", "oleh", "olehnya", "pada", "padahal", "padanya", "pak", "paling", "panjang", "pantas", "para", "pasti", "pastilah", "penting", "pentingnya", "per", "percuma", "perlu", "perlukah", "perlunya", 
"pernah", "persoalan", "pertama", "pertama-tama", "pertanyaan", "pertanyakan", "pihak", "pihaknya", "pukul", "pula", "pun", "punya", "rasa", "rasanya", "rata", "rupanya", "saat", "saatnya", "saja", "sajalah", "saling", "sama", "sama-sama", "sambil", "sampai", "sampai-sampai", "sampaikan", "sana", "sangat", "sangatlah", "satu", "saya", "sayalah", "se", "sebab", "sebabnya", "sebagai", "sebagaimana", "sebagainya", "sebagian", "sebaik", "sebaik-baiknya", "sebaiknya", "sebaliknya", "sebanyak", "sebegini", "sebegitu", "sebelum", "sebelumnya", "sebenarnya", "seberapa", "sebesar", "sebetulnya", "sebisanya", "sebuah", "sebut", "sebutlah", "sebutnya", "secara", "secukupnya", "sedang", "sedangkan", "sedemikian", "sedikit", "sedikitnya", "seenaknya", "segala", "segalanya", "segera", "seharusnya", "sehingga", "seingat", "sejak", "sejauh", "sejenak", "sejumlah", "sekadar", "sekadarnya", "sekali", "sekali-kali", "sekalian", "sekaligus", "sekalipun", "sekarang", "sekecil", "seketika", "sekiranya", "sekitar", "sekitarnya", "sekurang-kurangnya", "sekurangnya", "sela", "selagi", "selain", "selaku", "selalu", "selama", "selama-lamanya", "selamanya", "selanjutnya", "seluruh", "seluruhnya", "semacam", "semakin", "semampu", "semampunya", "semasa", "semasih", "semata", "semata-mata", "semaunya", "sementara", "semisal", "semisalnya", "sempat", "semua", "semuanya", "semula", "sendiri", "sendirian", "sendirinya", "seolah", "seolah-olah", "seorang", "sepanjang", "sepantasnya", "sepantasnyalah", "seperlunya", "seperti", "sepertinya", "sepihak", "sering", "seringnya", "serta", "serupa", "sesaat", "sesama", "sesampai", "sesegera", "sesekali", "seseorang", "sesuatu", "sesuatunya", "sesudah", "sesudahnya", "setelah", "setempat", "setengah", "seterusnya", "setiap", "setiba", "setibanya", "setidak-tidaknya", "setidaknya", "setinggi", "seusai", "sewaktu", "siap", "siapa", "siapakah", "siapapun", "sini", "sinilah", "soal", "soalnya", "suatu", "sudah", "sudahkah", "sudahlah", "supaya", "tadi", 
"tadinya", "tahu", "tahun", "tak", "tambah", "tambahnya", "tampak", "tampaknya", "tandas", "tandasnya", "tanpa", "tanya", "tanyakan", "tanyanya", "tapi", "tegas", "tegasnya", "telah", "tempat", "tengah", "tentang", "tentu", "tentulah", "tentunya", "tepat", "terakhir", "terasa", "terbanyak", "terdahulu", "terdapat", "terdiri", "terhadap", "terhadapnya", "teringat", "teringat-ingat", "terjadi", "terjadilah", "terjadinya", "terkira", "terlalu", "terlebih", "terlihat", "termasuk", "ternyata", "tersampaikan", "tersebut", "tersebutlah", "tertentu", "tertuju", "terus", "terutama", "tetap", "tetapi", "tiap", "tiba", "tiba-tiba", "tidak", "tidakkah", "tidaklah", "tiga", "tinggi", "toh", "tunjuk", "turut", "tutur", "tuturnya", "ucap", "ucapnya", "ujar", "ujarnya", "umum", "umumnya", "ungkap", "ungkapnya", "untuk", "usah", "usai", "waduh", "wah", "wahai", "waktu", "waktunya", "walau", "walaupun", "wong", "yaitu", "yakin", "yakni", "yang" )) IRISH = frozenset(( "a", "ach", "ag", "agus", "an", "aon", "ar", "arna", "as", "b'", "ba", "beirt", "bhúr", "caoga", "ceathair", "ceathrar", "chomh", "chtó", "chuig", "chun", "cois", "céad", "cúig", "cúigear", "d'", "daichead", "dar", "de", "deich", "deichniúr", "den", "dhá", "do", "don", "dtí", "dá", "dár", "dó", "faoi", "faoin", "faoina", "faoinár", "fara", "fiche", "gach", "gan", "go", "gur", "haon", "hocht", "i", "iad", "idir", "in", "ina", "ins", "inár", "is", "le", "leis", "lena", "lenár", "m'", "mar", "mo", "mé", "na", "nach", "naoi", "naonúr", "ná", "ní", "níor", "nó", "nócha", "ocht", "ochtar", "os", "roimh", "sa", "seacht", "seachtar", "seachtó", "seasca", "seisear", "siad", "sibh", "sinn", "sna", "sé", "sí", "tar", "thar", "thú", "triúr", "trí", "trína", "trínár", "tríocha", "tú", "um", "ár", "é", "éis", "í", "ó", "ón", "óna", "ónár" )) ITALIAN = frozenset(( "a", "abbastanza", "abbia", "abbiamo", "abbiano", "abbiate", "accidenti", "ad", "adesso", "affinché", "agl", "agli", "ahime", "ahimè", "ai", "al", "alcuna", "alcuni", 
"alcuno", "all", "alla", "alle", "allo", "allora", "altre", "altri", "altrimenti", "altro", "altrove", "altrui", "anche", "ancora", "anni", "anno", "ansa", "anticipo", "assai", "attesa", "attraverso", "avanti", "avemmo", "avendo", "avente", "aver", "avere", "averlo", "avesse", "avessero", "avessi", "avessimo", "aveste", "avesti", "avete", "aveva", "avevamo", "avevano", "avevate", "avevi", "avevo", "avrai", "avranno", "avrebbe", "avrebbero", "avrei", "avremmo", "avremo", "avreste", "avresti", "avrete", "avrà", "avrò", "avuta", "avute", "avuti", "avuto", "basta", "ben", "bene", "benissimo", "brava", "bravo", "buono", "c", "caso", "cento", "certa", "certe", "certi", "certo", "che", "chi", "chicchessia", "chiunque", "ci", "ciascuna", "ciascuno", "cima", "cinque", "cio", "cioe", "cioè", "circa", "citta", "città", "ciò", "co", "codesta", "codesti", "codesto", "cogli", "coi", "col", "colei", "coll", "coloro", "colui", "come", "cominci", "comprare", "comunque", "con", "concernente", "conclusione", "consecutivi", "consecutivo", "consiglio", "contro", "cortesia", "cos", "cosa", "cosi", "così", "cui", "d", "da", "dagl", "dagli", "dai", "dal", "dall", "dalla", "dalle", "dallo", "dappertutto", "davanti", "degl", "degli", "dei", "del", "dell", "della", "delle", "dello", "dentro", "detto", "deve", "devo", "di", "dice", "dietro", "dire", "dirimpetto", "diventa", "diventare", "diventato", "dopo", "doppio", "dov", "dove", "dovra", "dovrà", "dovunque", "due", "dunque", "durante", "e", "ebbe", "ebbero", "ebbi", "ecc", "ecco", "ed", "effettivamente", "egli", "ella", "entrambi", "eppure", "era", "erano", "eravamo", "eravate", "eri", "ero", "esempio", "esse", "essendo", "esser", "essere", "essi", "ex", "fa", "faccia", "facciamo", "facciano", "facciate", "faccio", "facemmo", "facendo", "facesse", "facessero", "facessi", "facessimo", "faceste", "facesti", "faceva", "facevamo", "facevano", "facevate", "facevi", "facevo", "fai", "fanno", "farai", "faranno", "fare", "farebbe", "farebbero", 
"farei", "faremmo", "faremo", "fareste", "faresti", "farete", "farà", "farò", "fatto", "favore", "fece", "fecero", "feci", "fin", "finalmente", "finche", "fine", "fino", "forse", "forza", "fosse", "fossero", "fossi", "fossimo", "foste", "fosti", "fra", "frattempo", "fu", "fui", "fummo", "fuori", "furono", "futuro", "generale", "gente", "gia", "giacche", "giorni", "giorno", "giu", "già", "gli", "gliela", "gliele", "glieli", "glielo", "gliene", "grande", "grazie", "gruppo", "ha", "haha", "hai", "hanno", "ho", "i", "ie", "ieri", "il", "improvviso", "in", "inc", "indietro", "infatti", "inoltre", "insieme", "intanto", "intorno", "invece", "io", "l", "la", "lasciato", "lato", "le", "lei", "li", "lo", "lontano", "loro", "lui", "lungo", "luogo", "là", "ma", "macche", "magari", "maggior", "mai", "male", "malgrado", "malissimo", "me", "medesimo", "mediante", "meglio", "meno", "mentre", "mesi", "mezzo", "mi", "mia", "mie", "miei", "mila", "miliardi", "milioni", "minimi", "mio", "modo", "molta", "molti", "moltissimo", "molto", "momento", "mondo", "ne", "negl", "negli", "nei", "nel", "nell", "nella", "nelle", "nello", "nemmeno", "neppure", "nessun", "nessuna", "nessuno", "niente", "no", "noi", "nome", "non", "nondimeno", "nonostante", "nonsia", "nostra", "nostre", "nostri", "nostro", "novanta", "nove", "nulla", "nuovi", "nuovo", "o", "od", "oggi", "ogni", "ognuna", "ognuno", "oltre", "oppure", "ora", "ore", "osi", "ossia", "ottanta", "otto", "paese", "parecchi", "parecchie", "parecchio", "parte", "partendo", "peccato", "peggio", "per", "perche", "perchè", "perché", "percio", "perciò", "perfino", "pero", "persino", "persone", "però", "piedi", "pieno", "piglia", "piu", "piuttosto", "più", "po", "pochissimo", "poco", "poi", "poiche", "possa", "possedere", "posteriore", "posto", "potrebbe", "preferibilmente", "presa", "press", "prima", "primo", "principalmente", "probabilmente", "promesso", "proprio", "puo", "pure", "purtroppo", "può", "qua", "qualche", "qualcosa", "qualcuna", 
"qualcuno", "quale", "quali", "qualunque", "quando", "quanta", "quante", "quanti", "quanto", "quantunque", "quarto", "quasi", "quattro", "quel", "quella", "quelle", "quelli", "quello", "quest", "questa", "queste", "questi", "questo", "qui", "quindi", "quinto", "realmente", "recente", "recentemente", "registrazione", "relativo", "riecco", "rispetto", "salvo", "sara", "sarai", "saranno", "sarebbe", "sarebbero", "sarei", "saremmo", "saremo", "sareste", "saresti", "sarete", "sarà", "sarò", "scola", "scopo", "scorso", "se", "secondo", "seguente", "seguito", "sei", "sembra", "sembrare", "sembrato", "sembrava", "sembri", "sempre", "senza", "sette", "si", "sia", "siamo", "siano", "siate", "siete", "sig", "solito", "solo", "soltanto", "sono", "sopra", "soprattutto", "sotto", "spesso", "sta", "stai", "stando", "stanno", "starai", "staranno", "starebbe", "starebbero", "starei", "staremmo", "staremo", "stareste", "staresti", "starete", "starà", "starò", "stata", "state", "stati", "stato", "stava", "stavamo", "stavano", "stavate", "stavi", "stavo", "stemmo", "stessa", "stesse", "stessero", "stessi", "stessimo", "stesso", "steste", "stesti", "stette", "stettero", "stetti", "stia", "stiamo", "stiano", "stiate", "sto", "su", "sua", "subito", "successivamente", "successivo", "sue", "sugl", "sugli", "sui", "sul", "sull", "sulla", "sulle", "sullo", "suo", "suoi", "tale", "tali", "talvolta", "tanto", "te", "tempo", "terzo", "th", "ti", "titolo", "tra", "tranne", "tre", "trenta", "triplo", "troppo", "trovato", "tu", "tua", "tue", "tuo", "tuoi", "tutta", "tuttavia", "tutte", "tutti", "tutto", "uguali", "ulteriore", "ultimo", "un", "una", "uno", "uomo", "va", "vai", "vale", "vari", "varia", "varie", "vario", "verso", "vi", "vicino", "visto", "vita", "voi", "volta", "volte", "vostra", "vostre", "vostri", "vostro", "è" )) LITHUANIAN = frozenset(( "abi", "abidvi", "abiejose", "abiejuose", "abiejø", "abiem", "abigaliai", "abipus", "abu", "abudu", "ai", "ana", "anaiptol", "anaisiais", 
"anajai", "anajam", "anajame", "anapus", "anas", "anasai", "anasis", "anei", "aniedvi", "anieji", "aniesiems", "anoji", "anojo", "anojoje", "anokia", "anoks", "anosiomis", "anosioms", "anosios", "anosiose", "anot", "ant", "antai", "anuodu", "anuoju", "anuosiuose", "anuosius", "anàja", "anàjà", "anàjá", "anàsias", "anøjø", "apie", "aplink", "ar", "arba", "argi", "arti", "aukðèiau", "að", "be", "bei", "beje", "bemaþ", "bent", "bet", "betgi", "beveik", "dar", "dargi", "daugmaþ", "deja", "dëka", "dël", "dëlei", "dëlto", "ech", "et", "gal", "galbût", "galgi", "gan", "gana", "gi", "greta", "idant", "iki", "ir", "irgi", "it", "itin", "ið", "iðilgai", "iðvis", "jaisiais", "jajai", "jajam", "jajame", "jei", "jeigu", "ji", "jiedu", "jiedvi", "jieji", "jiesiems", "jinai", "jis", "jisai", "jog", "joji", "jojo", "jojoje", "jokia", "joks", "josiomis", "josioms", "josios", "josiose", "judu", "judvi", "juk", "jumis", "jums", "jumyse", "juodu", "juoju", "juosiuose", "juosius", "jus", "jàja", "jàjà", "jàsias", "jájá", "jøjø", "jûs", "jûsiðkis", "jûsiðkë", "jûsø", "kad", "kada", "kadangi", "kai", "kaip", "kaipgi", "kas", "katra", "katras", "katriedvi", "katruodu", "kaþin", "kaþkas", "kaþkatra", "kaþkatras", "kaþkokia", "kaþkoks", "kaþkuri", "kaþkuris", "kiaurai", "kiek", "kiekvienas", "kieno", "kita", "kitas", "kitokia", "kitoks", "kodël", "kokia", "koks", "kol", "kolei", "kone", "kuomet", "kur", "kurgi", "kuri", "kuriedvi", "kuris", "kuriuodu", "lai", "lig", "ligi", "link", "lyg", "man", "manaisiais", "manajai", "manajam", "manajame", "manas", "manasai", "manasis", "mane", "manieji", "maniesiems", "manim", "manimi", "maniðkis", "maniðkë", "mano", "manoji", "manojo", "manojoje", "manosiomis", "manosioms", "manosios", "manosiose", "manuoju", "manuosiuose", "manuosius", "manyje", "manàja", "manàjà", "manàjá", "manàsias", "manæs", "manøjø", "mat", "maþdaug", "maþne", "mes", "mudu", "mudvi", "mumis", "mums", "mumyse", "mus", "mûsiðkis", "mûsiðkë", "mûsø", "na", "nagi", "ne", "nebe", 
"nebent", "negi", "negu", "nei", "nejau", "nejaugi", "nekaip", "nelyginant", "nes", "net", "netgi", "netoli", "neva", "nors", "nuo", "në", "o", "ogi", "oi", "paeiliui", "pagal", "pakeliui", "palaipsniui", "palei", "pas", "pasak", "paskos", "paskui", "paskum", "pat", "pati", "patiems", "paties", "pats", "patys", "patá", "paèiais", "paèiam", "paèiame", "paèiu", "paèiuose", "paèius", "paèiø", "per", "pernelyg", "pirm", "pirma", "pirmiau", "po", "prie", "prieð", "prieðais", "pro", "pusiau", "rasi", "rodos", "sau", "savaisiais", "savajai", "savajam", "savajame", "savas", "savasai", "savasis", "save", "savieji", "saviesiems", "savimi", "saviðkis", "saviðkë", "savo", "savoji", "savojo", "savojoje", "savosiomis", "savosioms", "savosios", "savosiose", "savuoju", "savuosiuose", "savuosius", "savyje", "savàja", "savàjà", "savàjá", "savàsias", "savæs", "savøjø", "skersai", "skradþiai", "staèiai", "su", "sulig", "ta", "tad", "tai", "taigi", "taip", "taipogi", "taisiais", "tajai", "tajam", "tajame", "tamsta", "tarp", "tarsi", "tartum", "tarytum", "tas", "tasai", "tau", "tavaisiais", "tavajai", "tavajam", "tavajame", "tavas", "tavasai", "tavasis", "tave", "tavieji", "taviesiems", "tavimi", "taviðkis", "taviðkë", "tavo", "tavoji", "tavojo", "tavojoje", "tavosiomis", "tavosioms", "tavosios", "tavosiose", "tavuoju", "tavuosiuose", "tavuosius", "tavyje", "tavàja", "tavàjà", "tavàjá", "tavàsias", "tavæs", "tavøjø", "taèiau", "te", "tegu", "tegul", "tiedvi", "tieji", "ties", "tiesiems", "tiesiog", "tik", "tikriausiai", "tiktai", "toji", "tojo", "tojoje", "tokia", "toks", "tol", "tolei", "toliau", "tosiomis", "tosioms", "tosios", "tosiose", "tu", "tuodu", "tuoju", "tuosiuose", "tuosius", "turbût", "tàja", "tàjà", "tàjá", "tàsias", "tøjø", "tûlas", "uþ", "uþtat", "uþvis", "va", "vai", "viduj", "vidury", "vien", "vienas", "vienokia", "vienoks", "vietoj", "virð", "virðuj", "virðum", "vis", "vis dëlto", "visa", "visas", "visgi", "visokia", "visoks", "vos", "vël", "vëlgi", "ypaè", "á", 
"ákypai", "ástriþai", "ðalia", "ðe", "ði", "ðiaisiais", "ðiajai", "ðiajam", "ðiajame", "ðiapus", "ðiedvi", "ðieji", "ðiesiems", "ðioji", "ðiojo", "ðiojoje", "ðiokia", "ðioks", "ðiosiomis", "ðiosioms", "ðiosios", "ðiosiose", "ðis", "ðisai", "ðit", "ðita", "ðitas", "ðitiedvi", "ðitokia", "ðitoks", "ðituodu", "ðiuodu", "ðiuoju", "ðiuosiuose", "ðiuosius", "ðiàja", "ðiàjà", "ðiàsias", "ðiøjø", "ðtai", "ðájá", "þemiau" )) NEPALI = frozenset(( "अक्सर", "अगाडि", "अझै", "अनुसार", "अन्तर्गत", "अन्य", "अन्यत्र", "अन्यथा", "अब", "अरू", "अरूलाई", "अर्को", "अर्थात", "अर्थात्", "अलग", "आए", "आजको", "आठ", "आत्म", "आदि", "आफू", "आफूलाई", "आफैलाई", "आफ्नै", "आफ्नो", "आयो", "उदाहरण", "उन", "उनको", "उनले", "उप", "उहाँलाई", "एउटै", "एक", "एकदम", "औं", "कतै", "कम", "कसरी", "कसै", "कसैले", "कहाँबाट", "कहिलेकाहीं", "कहिल्यै", "कहीं", "का", "कि", "किन", "किनभने", "कुनै", "कुरा", "कृपया", "के", "केहि", "केही", "को", "कोही", "क्रमशः", "गए", "गरि", "गरी", "गरेका", "गरेको", "गरेर", "गरौं", "गर्छ", "गर्छु", "गर्दै", "गर्न", "गर्नु", "गर्नुपर्छ", "गर्ने", "गर्यौं", "गैर", "चाँडै", "चार", "चाले", "चाहनुहुन्छ", "चाहन्छु", "चाहिए", "छ", "छन्", "छु", "छैन", "छौँ", "छौं", "जताततै", "जब", "जबकि", "जसको", "जसबाट", "जसमा", "जसलाई", "जसले", "जस्तै", "जस्तो", "जस्तोसुकै", "जहाँ", "जान", "जाहिर", "जुन", "जे", "जो", "ठीक", "त", "तत्काल", "तथा", "तदनुसार", "तपाइँको", "तपाईं", "तर", "तल", "तापनि", "तिनी", "तिनीहरू", "तिनीहरूको", "तिनीहरूलाई", "तिनीहरूले", "तिमी", "तिर", "ती", "तीन", "तुरुन्तै", "तेस्रो", "त्यसकारण", "त्यसपछि", "त्यसमा", "त्यसैले", "त्यहाँ", "त्यो", "थिए", "थिएन", "थिएनन्", "थियो", "दिए", "दिनुभएको", "दिनुहुन्छ", "दुई", "देख", "देखि", "देखिन्छ", "देखियो", "देखे", "देखेको", "देखेर", "देख्न", "दोश्रो", "दोस्रो", "धेरै", "न", "नजिकै", "नत्र", "नयाँ", "नि", "निम्ति", "निम्न", "निम्नानुसार", "निर्दिष्ट", "नै", "नौ", "पक्का", "पक्कै", "पछि", "पछिल्लो", "पटक", "पनि", "पर्छ", "पर्थ्यो", "पर्याप्त", "पहिले", "पहिलो", "पहिल्यै", "पाँच", "पाँचौं", "पूर्व", "प्रति", "प्रत्येक", "प्लस", "फेरि", "बने", 
"बन्द", "बन्न", "बरु", "बाटो", "बारे", "बाहिर", "बाहेक", "बीच", "बीचमा", "भए", "भएको", "भन", "भने", "भने्", "भन्छन्", "भन्छु", "भन्दा", "भन्नुभयो", "भन्ने", "भर", "भित्र", "भित्री", "म", "मलाई", "मा", "मात्र", "माथि", "मुख्य", "मेरो", "यति", "यथोचित", "यदि", "यद्यपि", "यस", "यसको", "यसपछि", "यसबाहेक", "यसरी", "यसो", "यस्तो", "यहाँ", "यहाँसम्म", "या", "यी", "यो", "र", "रही", "रहेका", "रहेको", "राखे", "राख्छ", "राम्रो", "रूप", "लगभग", "लाई", "लागि", "ले", "वरिपरि", "वास्तवमा", "वाहेक", "विरुद्ध", "विशेष", "शायद", "सँग", "सँगै", "सक्छ", "सट्टा", "सधैं", "सबै", "सबैलाई", "समय", "सम्भव", "सम्म", "सही", "साँच्चै", "सात", "साथ", "साथै", "सायद", "सारा", "से", "सो", "सोध्न", "सोही", "स्पष्ट", "हरे", "हरेक", "हामी", "हामीलाई", "हाम्रो", "हुँ", "हुन", "हुने", "हुनेछ", "हुन्", "हुन्छ", "हो", "होइन", "होइनन्", "होला", "होस्" )) NORWEGIAN = frozenset(( "alle", "andre", "arbeid", "at", "av", "bare", "begge", "ble", "blei", "bli", "blir", "blitt", "bort", "bra", "bruke", "både", "båe", "da", "de", "deg", "dei", "deim", "deira", "deires", "dem", "den", "denne", "der", "dere", "deres", "det", "dette", "di", "din", "disse", "ditt", "du", "dykk", "dykkar", "då", "eg", "ein", "eit", "eitt", "eller", "elles", "en", "ene", "eneste", "enhver", "enn", "er", "et", "ett", "etter", "folk", "for", "fordi", "forsûke", "fra", "få", "før", "fûr", "fûrst", "gjorde", "gjûre", "god", "gå", "ha", "hadde", "han", "hans", "har", "hennar", "henne", "hennes", "her", "hjå", "ho", "hoe", "honom", "hoss", "hossen", "hun", "hva", "hvem", "hver", "hvilke", "hvilken", "hvis", "hvor", "hvordan", "hvorfor", "i", "ikke", "ikkje", "ingen", "ingi", "inkje", "inn", "innen", "inni", "ja", "jeg", "kan", "kom", "korleis", "korso", "kun", "kunne", "kva", "kvar", "kvarhelst", "kven", "kvi", "kvifor", "lage", "lang", "lik", "like", "makt", "man", "mange", "me", "med", "medan", "meg", "meget", "mellom", "men", "mens", "mer", "mest", "mi", "min", "mine", "mitt", "mot", "mye", "mykje", "må", "måte", "navn", "ned", "nei", 
"no", "noe", "noen", "noka", "noko", "nokon", "nokor", "nokre", "ny", "nå", "når", "og", "også", "om", "opp", "oss", "over", "part", "punkt", "på", "rett", "riktig", "samme", "sant", "seg", "selv", "si", "sia", "sidan", "siden", "sin", "sine", "sist", "sitt", "sjøl", "skal", "skulle", "slik", "slutt", "so", "som", "somme", "somt", "start", "stille", "så", "sånn", "tid", "til", "tilbake", "tilstand", "um", "under", "upp", "ut", "uten", "var", "vart", "varte", "ved", "verdi", "vere", "verte", "vi", "vil", "ville", "vite", "vore", "vors", "vort", "vår", "være", "vært", "vöre", "vört", "å" )) PORTUGUESE = frozenset(( "a", "acerca", "adeus", "agora", "ainda", "alem", "algmas", "algo", "algumas", "alguns", "ali", "além", "ambas", "ambos", "ano", "anos", "antes", "ao", "aonde", "aos", "apenas", "apoio", "apontar", "apos", "após", "aquela", "aquelas", "aquele", "aqueles", "aqui", "aquilo", "as", "assim", "através", "atrás", "até", "aí", "baixo", "bastante", "bem", "boa", "boas", "bom", "bons", "breve", "cada", "caminho", "catorze", "cedo", "cento", "certamente", "certeza", "cima", "cinco", "coisa", "com", "como", "comprido", "conhecido", "conselho", "contra", "contudo", "corrente", "cuja", "cujas", "cujo", "cujos", "custa", "cá", "da", "daquela", "daquelas", "daquele", "daqueles", "dar", "das", "de", "debaixo", "dela", "delas", "dele", "deles", "demais", "dentro", "depois", "desde", "desligado", "dessa", "dessas", "desse", "desses", "desta", "destas", "deste", "destes", "deve", "devem", "deverá", "dez", "dezanove", "dezasseis", "dezassete", "dezoito", "dia", "diante", "direita", "dispoe", "dispoem", "diversa", "diversas", "diversos", "diz", "dizem", "dizer", "do", "dois", "dos", "doze", "duas", "durante", "dá", "dão", "dúvida", "e", "ela", "elas", "ele", "eles", "em", "embora", "enquanto", "entao", "entre", "então", "era", "eram", "essa", "essas", "esse", "esses", "esta", "estado", "estamos", "estar", "estará", "estas", "estava", "estavam", "este", "esteja", "estejam", 
"estejamos", "estes", "esteve", "estive", "estivemos", "estiver", "estivera", "estiveram", "estiverem", "estivermos", "estivesse", "estivessem", "estiveste", "estivestes", "estivéramos", "estivéssemos", "estou", "está", "estás", "estávamos", "estão", "eu", "exemplo", "falta", "fará", "favor", "faz", "fazeis", "fazem", "fazemos", "fazer", "fazes", "fazia", "faço", "fez", "fim", "final", "foi", "fomos", "for", "fora", "foram", "forem", "forma", "formos", "fosse", "fossem", "foste", "fostes", "fui", "fôramos", "fôssemos", "geral", "grande", "grandes", "grupo", "ha", "haja", "hajam", "hajamos", "havemos", "havia", "hei", "hoje", "hora", "horas", "houve", "houvemos", "houver", "houvera", "houveram", "houverei", "houverem", "houveremos", "houveria", "houveriam", "houvermos", "houverá", "houverão", "houveríamos", "houvesse", "houvessem", "houvéramos", "houvéssemos", "há", "hão", "iniciar", "inicio", "ir", "irá", "isso", "ista", "iste", "isto", "já", "lado", "lhe", "lhes", "ligado", "local", "logo", "longe", "lugar", "lá", "maior", "maioria", "maiorias", "mais", "mal", "mas", "me", "mediante", "meio", "menor", "menos", "meses", "mesma", "mesmas", "mesmo", "mesmos", "meu", "meus", "mil", "minha", "minhas", "momento", "muito", "muitos", "máximo", "mês", "na", "nada", "nao", "naquela", "naquelas", "naquele", "naqueles", "nas", "nem", "nenhuma", "nessa", "nessas", "nesse", "nesses", "nesta", "nestas", "neste", "nestes", "no", "noite", "nome", "nos", "nossa", "nossas", "nosso", "nossos", "nova", "novas", "nove", "novo", "novos", "num", "numa", "numas", "nunca", "nuns", "não", "nível", "nós", "número", "o", "obra", "obrigada", "obrigado", "oitava", "oitavo", "oito", "onde", "ontem", "onze", "os", "ou", "outra", "outras", "outro", "outros", "para", "parece", "parte", "partir", "paucas", "pegar", "pela", "pelas", "pelo", "pelos", "perante", "perto", "pessoas", "pode", "podem", "poder", "poderá", "podia", "pois", "ponto", "pontos", "por", "porque", "porquê", "portanto", "posição", 
"possivelmente", "posso", "possível", "pouca", "pouco", "poucos", "povo", "primeira", "primeiras", "primeiro", "primeiros", "promeiro", "propios", "proprio", "própria", "próprias", "próprio", "próprios", "próxima", "próximas", "próximo", "próximos", "puderam", "pôde", "põe", "põem", "quais", "qual", "qualquer", "quando", "quanto", "quarta", "quarto", "quatro", "que", "quem", "quer", "quereis", "querem", "queremas", "queres", "quero", "questão", "quieto", "quinta", "quinto", "quinze", "quáis", "quê", "relação", "sabe", "sabem", "saber", "se", "segunda", "segundo", "sei", "seis", "seja", "sejam", "sejamos", "sem", "sempre", "sendo", "ser", "serei", "seremos", "seria", "seriam", "será", "serão", "seríamos", "sete", "seu", "seus", "sexta", "sexto", "sim", "sistema", "sob", "sobre", "sois", "somente", "somos", "sou", "sua", "suas", "são", "sétima", "sétimo", "só", "tal", "talvez", "tambem", "também", "tanta", "tantas", "tanto", "tarde", "te", "tem", "temos", "tempo", "tendes", "tenha", "tenham", "tenhamos", "tenho", "tens", "tentar", "tentaram", "tente", "tentei", "ter", "terceira", "terceiro", "terei", "teremos", "teria", "teriam", "terá", "terão", "teríamos", "teu", "teus", "teve", "tinha", "tinham", "tipo", "tive", "tivemos", "tiver", "tivera", "tiveram", "tiverem", "tivermos", "tivesse", "tivessem", "tiveste", "tivestes", "tivéramos", "tivéssemos", "toda", "todas", "todo", "todos", "trabalhar", "trabalho", "treze", "três", "tu", "tua", "tuas", "tudo", "tão", "tém", "têm", "tínhamos", "um", "uma", "umas", "uns", "usa", "usar", "vai", "vais", "valor", "veja", "vem", "vens", "ver", "verdade", "verdadeiro", "vez", "vezes", "viagem", "vindo", "vinte", "você", "vocês", "vos", "vossa", "vossas", "vosso", "vossos", "vários", "vão", "vêm", "vós", "zero", "à", "às", "área", "é", "éramos", "és", "último" )) ROMANIAN = frozenset(( "a", "abia", "acea", "aceasta", "această", "aceea", "aceeasi", "acei", "aceia", "acel", "acela", "acelasi", "acele", "acelea", "acest", "acesta", 
"aceste", "acestea", "acestei", "acestia", "acestui", "aceşti", "aceştia", "acolo", "acord", "acum", "adica", "ai", "aia", "aibă", "aici", "aiurea", "al", "ala", "alaturi", "ale", "alea", "alt", "alta", "altceva", "altcineva", "alte", "altfel", "alti", "altii", "altul", "am", "anume", "apoi", "ar", "are", "as", "asa", "asemenea", "asta", "astazi", "astea", "astfel", "astăzi", "asupra", "atare", "atat", "atata", "atatea", "atatia", "ati", "atit", "atita", "atitea", "atitia", "atunci", "au", "avea", "avem", "aveţi", "avut", "azi", "aş", "aşadar", "aţi", "b", "ba", "bine", "bucur", "bună", "c", "ca", "cam", "cand", "capat", "care", "careia", "carora", "caruia", "cat", "catre", "caut", "ce", "cea", "ceea", "cei", "ceilalti", "cel", "cele", "celor", "ceva", "chiar", "ci", "cinci", "cind", "cine", "cineva", "cit", "cita", "cite", "citeva", "citi", "citiva", "conform", "contra", "cu", "cui", "cum", "cumva", "curând", "curînd", "când", "cât", "câte", "câtva", "câţi", "cînd", "cît", "cîte", "cîtva", "cîţi", "că", "căci", "cărei", "căror", "cărui", "către", "d", "da", "daca", "dacă", "dar", "dat", "datorită", "dată", "dau", "de", "deasupra", "deci", "decit", "degraba", "deja", "deoarece", "departe", "desi", "despre", "deşi", "din", "dinaintea", "dintr", "dintr-", "dintre", "doar", "doi", "doilea", "două", "drept", "dupa", "după", "dă", "e", "ea", "ei", "el", "ele", "era", "eram", "este", "eu", "exact", "eşti", "f", "face", "fara", "fata", "fel", "fi", "fie", "fiecare", "fii", "fim", "fiu", "fiţi", "foarte", "fost", "frumos", "fără", "g", "geaba", "graţie", "h", "halbă", "i", "ia", "iar", "ieri", "ii", "il", "imi", "in", "inainte", "inapoi", "inca", "incit", "insa", "intr", "intre", "isi", "iti", "j", "k", "l", "la", "le", "li", "lor", "lui", "lângă", "lîngă", "m", "ma", "mai", "mare", "mea", "mei", "mele", "mereu", "meu", "mi", "mie", "mine", "mod", "mult", "multa", "multe", "multi", "multă", "mulţi", "mulţumesc", "mâine", "mîine", "mă", "n", "ne", "nevoie", "ni", "nici", 
"niciodata", "nicăieri", "nimeni", "nimeri", "nimic", "niste", "nişte", "noastre", "noastră", "noi", "noroc", "nostri", "nostru", "nou", "noua", "nouă", "noştri", "nu", "numai", "o", "opt", "or", "ori", "oricare", "orice", "oricine", "oricum", "oricând", "oricât", "oricînd", "oricît", "oriunde", "p", "pai", "parca", "patra", "patru", "patrulea", "pe", "pentru", "peste", "pic", "pina", "plus", "poate", "pot", "prea", "prima", "primul", "prin", "printr-", "putini", "puţin", "puţina", "puţină", "până", "pînă", "r", "rog", "s", "sa", "sa-mi", "sa-ti", "sai", "sale", "sau", "se", "si", "sint", "sintem", "spate", "spre", "sub", "sunt", "suntem", "sunteţi", "sus", "sută", "sînt", "sîntem", "sînteţi", "să", "săi", "său", "t", "ta", "tale", "te", "ti", "timp", "tine", "toata", "toate", "toată", "tocmai", "tot", "toti", "totul", "totusi", "totuşi", "toţi", "trei", "treia", "treilea", "tu", "tuturor", "tăi", "tău", "u", "ul", "ului", "un", "una", "unde", "undeva", "unei", "uneia", "unele", "uneori", "unii", "unor", "unora", "unu", "unui", "unuia", "unul", "v", "va", "vi", "voastre", "voastră", "voi", "vom", "vor", "vostru", "vouă", "voştri", "vreme", "vreo", "vreun", "vă", "x", "z", "zece", "zero", "zi", "zice", "îi", "îl", "îmi", "împotriva", "în", "înainte", "înaintea", "încotro", "încât", "încît", "între", "întrucât", "întrucît", "îţi", "ăla", "ălea", "ăsta", "ăstea", "ăştia", "şapte", "şase", "şi", "ştiu", "ţi", "ţie" )) RUSSIAN = frozenset(( "c", "а", "алло", "без", "белый", "близко", "более", "больше", "большой", "будем", "будет", "будете", "будешь", "будто", "буду", "будут", "будь", "бы", "бывает", "бывь", "был", "была", "были", "было", "быть", "в", "важная", "важное", "важные", "важный", "вам", "вами", "вас", "ваш", "ваша", "ваше", "ваши", "вверх", "вдали", "вдруг", "ведь", "везде", "вернуться", "весь", "вечер", "взгляд", "взять", "вид", "видел", "видеть", "вместе", "вне", "вниз", "внизу", "во", "вода", "война", "вокруг", "вон", "вообще", "вопрос", "восемнадцатый", 
"восемнадцать", "восемь", "восьмой", "вот", "впрочем", "времени", "время", "все", "все еще", "всегда", "всего", "всем", "всеми", "всему", "всех", "всею", "всю", "всюду", "вся", "всё", "второй", "вы", "выйти", "г", "где", "главный", "глаз", "говорил", "говорит", "говорить", "год", "года", "году", "голова", "голос", "город", "да", "давать", "давно", "даже", "далекий", "далеко", "дальше", "даром", "дать", "два", "двадцатый", "двадцать", "две", "двенадцатый", "двенадцать", "дверь", "двух", "девятнадцатый", "девятнадцать", "девятый", "девять", "действительно", "дел", "делал", "делать", "делаю", "дело", "день", "деньги", "десятый", "десять", "для", "до", "довольно", "долго", "должен", "должно", "должный", "дом", "дорога", "друг", "другая", "другие", "других", "друго", "другое", "другой", "думать", "душа", "е", "его", "ее", "ей", "ему", "если", "есть", "еще", "ещё", "ею", "её", "ж", "ждать", "же", "жена", "женщина", "жизнь", "жить", "за", "занят", "занята", "занято", "заняты", "затем", "зато", "зачем", "здесь", "земля", "знать", "значит", "значить", "и", "иди", "идти", "из", "или", "им", "имеет", "имел", "именно", "иметь", "ими", "имя", "иногда", "их", "к", "каждая", "каждое", "каждые", "каждый", "кажется", "казаться", "как", "какая", "какой", "кем", "книга", "когда", "кого", "ком", "комната", "кому", "конец", "конечно", "которая", "которого", "которой", "которые", "который", "которых", "кроме", "кругом", "кто", "куда", "лежать", "лет", "ли", "лицо", "лишь", "лучше", "любить", "люди", "м", "маленький", "мало", "мать", "машина", "между", "меля", "менее", "меньше", "меня", "место", "миллионов", "мимо", "минута", "мир", "мира", "мне", "много", "многочисленная", "многочисленное", "многочисленные", "многочисленный", "мной", "мною", "мог", "могу", "могут", "мож", "может", "может быть", "можно", "можхо", "мои", "мой", "мор", "москва", "мочь", "моя", "моё", "мы", "на", "наверху", "над", "надо", "назад", "наиболее", "найти", "наконец", "нам", "нами", "народ", "нас", "начала", 
"начать", "наш", "наша", "наше", "наши", "не", "него", "недавно", "недалеко", "нее", "ней", "некоторый", "нельзя", "нем", "немного", "нему", "непрерывно", "нередко", "несколько", "нет", "нею", "неё", "ни", "нибудь", "ниже", "низко", "никакой", "никогда", "никто", "никуда", "ним", "ними", "них", "ничего", "ничто", "но", "новый", "нога", "ночь", "ну", "нужно", "нужный", "нх", "о", "об", "оба", "обычно", "один", "одиннадцатый", "одиннадцать", "однажды", "однако", "одного", "одной", "оказаться", "окно", "около", "он", "она", "они", "оно", "опять", "особенно", "остаться", "от", "ответить", "отец", "откуда", "отовсюду", "отсюда", "очень", "первый", "перед", "писать", "плечо", "по", "под", "подойди", "подумать", "пожалуйста", "позже", "пойти", "пока", "пол", "получить", "помнить", "понимать", "понять", "пор", "пора", "после", "последний", "посмотреть", "посреди", "потом", "потому", "почему", "почти", "правда", "прекрасно", "при", "про", "просто", "против", "процентов", "путь", "пятнадцатый", "пятнадцать", "пятый", "пять", "работа", "работать", "раз", "разве", "рано", "раньше", "ребенок", "решить", "россия", "рука", "русский", "ряд", "рядом", "с", "с кем", "сам", "сама", "сами", "самим", "самими", "самих", "само", "самого", "самой", "самом", "самому", "саму", "самый", "свет", "свое", "своего", "своей", "свои", "своих", "свой", "свою", "сделать", "сеаой", "себе", "себя", "сегодня", "седьмой", "сейчас", "семнадцатый", "семнадцать", "семь", "сидеть", "сила", "сих", "сказал", "сказала", "сказать", "сколько", "слишком", "слово", "случай", "смотреть", "сначала", "снова", "со", "собой", "собою", "советский", "совсем", "спасибо", "спросить", "сразу", "стал", "старый", "стать", "стол", "сторона", "стоять", "страна", "суть", "считать", "т", "та", "так", "такая", "также", "таки", "такие", "такое", "такой", "там", "твои", "твой", "твоя", "твоё", "те", "тебе", "тебя", "тем", "теми", "теперь", "тех", "то", "тобой", "тобою", "товарищ", "тогда", "того", "тоже", "только", "том", "тому", 
"тот", "тою", "третий", "три", "тринадцатый", "тринадцать", "ту", "туда", "тут", "ты", "тысяч", "у", "увидеть", "уж", "уже", "улица", "уметь", "утро", "хороший", "хорошо", "хотел бы", "хотеть", "хоть", "хотя", "хочешь", "час", "часто", "часть", "чаще", "чего", "человек", "чем", "чему", "через", "четвертый", "четыре", "четырнадцатый", "четырнадцать", "что", "чтоб", "чтобы", "чуть", "шестнадцатый", "шестнадцать", "шестой", "шесть", "эта", "эти", "этим", "этими", "этих", "это", "этого", "этой", "этом", "этому", "этот", "эту", "я", "являюсь" )) SERBIAN = frozenset(( "a", "ako", "ali", "bi", "bih", "bila", "bili", "bilo", "bio", "bismo", "biste", "biti", "bumo", "da", "do", "duž", "ga", "hoće", "hoćemo", "hoćete", "hoćeš", "hoću", "i", "iako", "ih", "ili", "iz", "ja", "je", "jedna", "jedne", "jedno", "jer", "jesam", "jesi", "jesmo", "jest", "jeste", "jesu", "jim", "joj", "još", "ju", "kada", "kako", "kao", "koja", "koje", "koji", "kojima", "koju", "kroz", "li", "me", "mene", "meni", "mi", "mimo", "moj", "moja", "moje", "mu", "na", "nad", "nakon", "nam", "nama", "nas", "naš", "naša", "naše", "našeg", "ne", "nego", "neka", "neki", "nekog", "neku", "nema", "netko", "neće", "nećemo", "nećete", "nećeš", "neću", "nešto", "ni", "nije", "nikoga", "nikoje", "nikoju", "nisam", "nisi", "nismo", "niste", "nisu", "njega", "njegov", "njegova", "njegovo", "njemu", "njezin", "njezina", "njezino", "njih", "njihov", "njihova", "njihovo", "njim", "njima", "njoj", "nju", "no", "o", "od", "odmah", "on", "ona", "oni", "ono", "ova", "pa", "pak", "po", "pod", "pored", "prije", "s", "sa", "sam", "samo", "se", "sebe", "sebi", "si", "smo", "ste", "su", "sve", "svi", "svog", "svoj", "svoja", "svoje", "svom", "ta", "tada", "taj", "tako", "te", "tebe", "tebi", "ti", "to", "toj", "tome", "tu", "tvoj", "tvoja", "tvoje", "u", "uz", "vam", "vama", "vas", "vaš", "vaša", "vaše", "već", "vi", "vrlo", "za", "zar", "će", "ćemo", "ćete", "ćeš", "ću", "što" )) SPANISH = frozenset(( "0", "1", "2", "3", "4", 
"5", "6", "7", "8", "9", "_", "a", "actualmente", "acuerdo", "adelante", "ademas", "además", "adrede", "afirmó", "agregó", "ahi", "ahora", "ahí", "al", "algo", "alguna", "algunas", "alguno", "algunos", "algún", "alli", "allí", "alrededor", "ambos", "ampleamos", "antano", "antaño", "ante", "anterior", "antes", "apenas", "aproximadamente", "aquel", "aquella", "aquellas", "aquello", "aquellos", "aqui", "aquél", "aquélla", "aquéllas", "aquéllos", "aquí", "arriba", "arribaabajo", "aseguró", "asi", "así", "atras", "aun", "aunque", "ayer", "añadió", "aún", "b", "bajo", "bastante", "bien", "breve", "buen", "buena", "buenas", "bueno", "buenos", "c", "cada", "casi", "cerca", "cierta", "ciertas", "cierto", "ciertos", "cinco", "claro", "comentó", "como", "con", "conmigo", "conocer", "conseguimos", "conseguir", "considera", "consideró", "consigo", "consigue", "consiguen", "consigues", "contigo", "contra", "cosas", "creo", "cual", "cuales", "cualquier", "cuando", "cuanta", "cuantas", "cuanto", "cuantos", "cuatro", "cuenta", "cuál", "cuáles", "cuándo", "cuánta", "cuántas", "cuánto", "cuántos", "cómo", "d", "da", "dado", "dan", "dar", "de", "debajo", "debe", "deben", "debido", "decir", "dejó", "del", "delante", "demasiado", "demás", "dentro", "deprisa", "desde", "despacio", "despues", "después", "detras", "detrás", "dia", "dias", "dice", "dicen", "dicho", "dieron", "diferente", "diferentes", "dijeron", "dijo", "dio", "donde", "dos", "durante", "día", "días", "dónde", "e", "ejemplo", "el", "ella", "ellas", "ello", "ellos", "embargo", "empleais", "emplean", "emplear", "empleas", "empleo", "en", "encima", "encuentra", "enfrente", "enseguida", "entonces", "entre", "era", "erais", "eramos", "eran", "eras", "eres", "es", "esa", "esas", "ese", "eso", "esos", "esta", "estaba", "estabais", "estaban", "estabas", "estad", "estada", "estadas", "estado", "estados", "estais", "estamos", "estan", "estando", "estar", "estaremos", "estará", "estarán", "estarás", "estaré", "estaréis", "estaría", 
"estaríais", "estaríamos", "estarían", "estarías", "estas", "este", "estemos", "esto", "estos", "estoy", "estuve", "estuviera", "estuvierais", "estuvieran", "estuvieras", "estuvieron", "estuviese", "estuvieseis", "estuviesen", "estuvieses", "estuvimos", "estuviste", "estuvisteis", "estuviéramos", "estuviésemos", "estuvo", "está", "estábamos", "estáis", "están", "estás", "esté", "estéis", "estén", "estés", "ex", "excepto", "existe", "existen", "explicó", "expresó", "f", "fin", "final", "fue", "fuera", "fuerais", "fueran", "fueras", "fueron", "fuese", "fueseis", "fuesen", "fueses", "fui", "fuimos", "fuiste", "fuisteis", "fuéramos", "fuésemos", "g", "general", "gran", "grandes", "gueno", "h", "ha", "haber", "habia", "habida", "habidas", "habido", "habidos", "habiendo", "habla", "hablan", "habremos", "habrá", "habrán", "habrás", "habré", "habréis", "habría", "habríais", "habríamos", "habrían", "habrías", "habéis", "había", "habíais", "habíamos", "habían", "habías", "hace", "haceis", "hacemos", "hacen", "hacer", "hacerlo", "haces", "hacia", "haciendo", "hago", "han", "has", "hasta", "hay", "haya", "hayamos", "hayan", "hayas", "hayáis", "he", "hecho", "hemos", "hicieron", "hizo", "horas", "hoy", "hube", "hubiera", "hubierais", "hubieran", "hubieras", "hubieron", "hubiese", "hubieseis", "hubiesen", "hubieses", "hubimos", "hubiste", "hubisteis", "hubiéramos", "hubiésemos", "hubo", "i", "igual", "incluso", "indicó", "informo", "informó", "intenta", "intentais", "intentamos", "intentan", "intentar", "intentas", "intento", "ir", "j", "junto", "k", "l", "la", "lado", "largo", "las", "le", "lejos", "les", "llegó", "lleva", "llevar", "lo", "los", "luego", "lugar", "m", "mal", "manera", "manifestó", "mas", "mayor", "me", "mediante", "medio", "mejor", "mencionó", "menos", "menudo", "mi", "mia", "mias", "mientras", "mio", "mios", "mis", "misma", "mismas", "mismo", "mismos", "modo", "momento", "mucha", "muchas", "mucho", "muchos", "muy", "más", "mí", "mía", "mías", "mío", "míos", 
"n", "nada", "nadie", "ni", "ninguna", "ningunas", "ninguno", "ningunos", "ningún", "no", "nos", "nosotras", "nosotros", "nuestra", "nuestras", "nuestro", "nuestros", "nueva", "nuevas", "nuevo", "nuevos", "nunca", "o", "ocho", "os", "otra", "otras", "otro", "otros", "p", "pais", "para", "parece", "parte", "partir", "pasada", "pasado", "paìs", "peor", "pero", "pesar", "poca", "pocas", "poco", "pocos", "podeis", "podemos", "poder", "podria", "podriais", "podriamos", "podrian", "podrias", "podrá", "podrán", "podría", "podrían", "poner", "por", "por qué", "porque", "posible", "primer", "primera", "primero", "primeros", "principalmente", "pronto", "propia", "propias", "propio", "propios", "proximo", "próximo", "próximos", "pudo", "pueda", "puede", "pueden", "puedo", "pues", "q", "qeu", "que", "quedó", "queremos", "quien", "quienes", "quiere", "quiza", "quizas", "quizá", "quizás", "quién", "quiénes", "qué", "r", "raras", "realizado", "realizar", "realizó", "repente", "respecto", "s", "sabe", "sabeis", "sabemos", "saben", "saber", "sabes", "sal", "salvo", "se", "sea", "seamos", "sean", "seas", "segun", "segunda", "segundo", "según", "seis", "ser", "sera", "seremos", "será", "serán", "serás", "seré", "seréis", "sería", "seríais", "seríamos", "serían", "serías", "seáis", "señaló", "si", "sido", "siempre", "siendo", "siete", "sigue", "siguiente", "sin", "sino", "sobre", "sois", "sola", "solamente", "solas", "solo", "solos", "somos", "son", "soy", "soyos", "su", "supuesto", "sus", "suya", "suyas", "suyo", "suyos", "sé", "sí", "sólo", "t", "tal", "tambien", "también", "tampoco", "tan", "tanto", "tarde", "te", "temprano", "tendremos", "tendrá", "tendrán", "tendrás", "tendré", "tendréis", "tendría", "tendríais", "tendríamos", "tendrían", "tendrías", "tened", "teneis", "tenemos", "tener", "tenga", "tengamos", "tengan", "tengas", "tengo", "tengáis", "tenida", "tenidas", "tenido", "tenidos", "teniendo", "tenéis", "tenía", "teníais", "teníamos", "tenían", "tenías", "tercera", "ti", 
"tiempo", "tiene", "tienen", "tienes", "toda", "todas", "todavia", "todavía", "todo", "todos", "total", "trabaja", "trabajais", "trabajamos", "trabajan", "trabajar", "trabajas", "trabajo", "tras", "trata", "través", "tres", "tu", "tus", "tuve", "tuviera", "tuvierais", "tuvieran", "tuvieras", "tuvieron", "tuviese", "tuvieseis", "tuviesen", "tuvieses", "tuvimos", "tuviste", "tuvisteis", "tuviéramos", "tuviésemos", "tuvo", "tuya", "tuyas", "tuyo", "tuyos", "tú", "u", "ultimo", "un", "una", "unas", "uno", "unos", "usa", "usais", "usamos", "usan", "usar", "usas", "uso", "usted", "ustedes", "v", "va", "vais", "valor", "vamos", "van", "varias", "varios", "vaya", "veces", "ver", "verdad", "verdadera", "verdadero", "vez", "vosotras", "vosotros", "voy", "vuestra", "vuestras", "vuestro", "vuestros", "w", "x", "y", "ya", "yo", "z", "él", "éramos", "ésa", "ésas", "ése", "ésos", "ésta", "éstas", "éste", "éstos", "última", "últimas", "último", "últimos" )) SWEDISH = frozenset(( "aderton", "adertonde", "adjö", "aldrig", "alla", "allas", "allt", "alltid", "alltså", "andra", "andras", "annan", "annat", "artonde", "artonn", "att", "av", "bakom", "bara", "behöva", "behövas", "behövde", "behövt", "beslut", "beslutat", "beslutit", "bland", "blev", "bli", "blir", "blivit", "bort", "borta", "bra", "bäst", "bättre", "båda", "bådas", "dag", "dagar", "dagarna", "dagen", "de", "del", "delen", "dem", "den", "denna", "deras", "dess", "dessa", "det", "detta", "dig", "din", "dina", "dit", "ditt", "dock", "dom", "du", "där", "därför", "då", "e", "efter", "eftersom", "ej", "elfte", "eller", "elva", "emot", "en", "enkel", "enkelt", "enkla", "enligt", "ens", "er", "era", "ers", "ert", "ett", "ettusen", "fanns", "fem", "femte", "femtio", "femtionde", "femton", "femtonde", "fick", "fin", "finnas", "finns", "fjorton", "fjortonde", "fjärde", "fler", "flera", "flesta", "fram", "framför", "från", "fyra", "fyrtio", "fyrtionde", "få", "får", "fått", "följande", "för", "före", "förlåt", "förra", "första", 
"genast", "genom", "gick", "gjorde", "gjort", "god", "goda", "godare", "godast", "gott", "gälla", "gäller", "gällt", "gärna", "gå", "går", "gått", "gör", "göra", "ha", "hade", "haft", "han", "hans", "har", "heller", "hellre", "helst", "helt", "henne", "hennes", "hit", "hon", "honom", "hundra", "hundraen", "hundraett", "hur", "här", "hög", "höger", "högre", "högst", "i", "ibland", "icke", "idag", "igen", "igår", "imorgon", "in", "inför", "inga", "ingen", "ingenting", "inget", "innan", "inne", "inom", "inte", "inuti", "ja", "jag", "jo", "ju", "just", "jämfört", "kan", "kanske", "knappast", "kom", "komma", "kommer", "kommit", "kr", "kunde", "kunna", "kunnat", "kvar", "legat", "ligga", "ligger", "lika", "likställd", "likställda", "lilla", "lite", "liten", "litet", "länge", "längre", "längst", "lätt", "lättare", "lättast", "långsam", "långsammare", "långsammast", "långsamt", "långt", "låt", "man", "med", "mej", "mellan", "men", "mer", "mera", "mest", "mig", "min", "mina", "mindre", "minst", "mitt", "mittemot", "mot", "mycket", "många", "måste", "möjlig", "möjligen", "möjligt", "möjligtvis", "ned", "nederst", "nedersta", "nedre", "nej", "ner", "ni", "nio", "nionde", "nittio", "nittionde", "nitton", "nittonde", "nog", "noll", "nr", "nu", "nummer", "när", "nästa", "någon", "någonting", "något", "några", "nån", "nånting", "nåt", "nödvändig", "nödvändiga", "nödvändigt", "nödvändigtvis", "och", "också", "ofta", "oftast", "olika", "olikt", "om", "oss", "på", "rakt", "redan", "rätt", "sa", "sade", "sagt", "samma", "sedan", "senare", "senast", "sent", "sex", "sextio", "sextionde", "sexton", "sextonde", "sig", "sin", "sina", "sist", "sista", "siste", "sitt", "sitta", "sju", "sjunde", "sjuttio", "sjuttionde", "sjutton", "sjuttonde", "själv", "sjätte", "ska", "skall", "skulle", "slutligen", "små", "smått", "snart", "som", "stor", "stora", "stort", "större", "störst", "säga", "säger", "sämre", "sämst", "så", "sådan", "sådana", "sådant", "ta", "tack", "tar", "tidig", "tidigare", 
"tidigast", "tidigt", "till", "tills", "tillsammans", "tio", "tionde", "tjugo", "tjugoen", "tjugoett", "tjugonde", "tjugotre", "tjugotvå", "tjungo", "tolfte", "tolv", "tre", "tredje", "trettio", "trettionde", "tretton", "trettonde", "två", "tvåhundra", "under", "upp", "ur", "ursäkt", "ut", "utan", "utanför", "ute", "va", "vad", "var", "vara", "varför", "varifrån", "varit", "varje", "varken", "vars", "varsågod", "vart", "vem", "vems", "verkligen", "vi", "vid", "vidare", "viktig", "viktigare", "viktigast", "viktigt", "vilka", "vilkas", "vilken", "vilket", "vill", "väl", "vänster", "vänstra", "värre", "vår", "våra", "vårt", "än", "ännu", "är", "även", "åt", "åtminstone", "åtta", "åttio", "åttionde", "åttonde", "över", "övermorgon", "överst", "övre" )) TAMIL = frozenset(( "அங்கு", "அங்கே", "அடுத்த", "அதனால்", "அதன்", "அதற்கு", "அதிக", "அதில்", "அது", "அதே", "அதை", "அந்த", "அந்தக்", "அந்தப்", "அன்று", "அல்லது", "அவன்", "அவரது", "அவர்", "அவர்கள்", "அவள்", "அவை", "ஆகிய", "ஆகியோர்", "ஆகும்", "ஆனால்", "இங்கு", "இங்கே", "இடத்தில்", "இடம்", "இதனால்", "இதனை", "இதன்", "இதற்கு", "இதில்", "இது", "இதை", "இந்த", "இந்தக்", "இந்தத்", "இந்தப்", "இன்னும்", "இப்போது", "இரு", "இருக்கும்", "இருந்த", "இருந்தது", "இருந்து", "இல்லை", "இவர்", "இவை", "உன்", "உள்ள", "உள்ளது", "உள்ளன", "எந்த", "என", "எனக்", "எனக்கு", "எனப்படும்", "எனவும்", "எனவே", "எனினும்", "எனும்", "என்", "என்ன", "என்னும்", "என்பது", "என்பதை", "என்ற", "என்று", "என்றும்", "எல்லாம்", "ஏன்", "ஒரு", "ஒரே", "ஓர்", "கொண்ட", "கொண்டு", "கொள்ள", "சற்று", "சிறு", "சில", "சேர்ந்த", "தனது", "தன்", "தவிர", "தான்", "நான்", "நாம்", "நீ", "பற்றி", "பற்றிய", "பல", "பலரும்", "பல்வேறு", "பின்", "பின்னர்", "பிற", "பிறகு", "பெரும்", "பேர்", "போது", "போன்ற", "போல", "போல்", "மட்டுமே", "மட்டும்", "மற்ற", "மற்றும்", "மிக", "மிகவும்", "மீது", "முதல்", "முறை", "மேலும்", "மேல்", "யார்", "வந்த", "வந்து", "வரும்", "வரை", "வரையில்", "விட", "விட்டு", "வேண்டும்", "வேறு" )) TURKISH = frozenset(( "acaba", "acep", "adamakıllı", "adeta", "ait", "altmýþ", 
"altmış", "altý", "altı", "ama", "amma", "anca", "ancak", "arada", "artýk", "aslında", "aynen", "ayrıca", "az", "açıkça", "açıkçası", "bana", "bari", "bazen", "bazý", "bazı", "başkası", "baţka", "belki", "ben", "benden", "beni", "benim", "beri", "beriki", "beþ", "beş", "beţ", "bilcümle", "bile", "bin", "binaen", "binaenaleyh", "bir", "biraz", "birazdan", "birbiri", "birden", "birdenbire", "biri", "birice", "birileri", "birisi", "birkaç", "birkaçı", "birkez", "birlikte", "birçok", "birçoğu", "birþey", "birþeyi", "birşey", "birşeyi", "birţey", "bitevi", "biteviye", "bittabi", "biz", "bizatihi", "bizce", "bizcileyin", "bizden", "bize", "bizi", "bizim", "bizimki", "bizzat", "boşuna", "bu", "buna", "bunda", "bundan", "bunlar", "bunları", "bunların", "bunu", "bunun", "buracıkta", "burada", "buradan", "burası", "böyle", "böylece", "böylecene", "böylelikle", "böylemesine", "böylesine", "büsbütün", "bütün", "cuk", "cümlesi", "da", "daha", "dahi", "dahil", "dahilen", "daima", "dair", "dayanarak", "de", "defa", "dek", "demin", "demincek", "deminden", "denli", "derakap", "derhal", "derken", "deđil", "değil", "değin", "diye", "diđer", "diğer", "diğeri", "doksan", "dokuz", "dolayı", "dolayısıyla", "doğru", "dört", "edecek", "eden", "ederek", "edilecek", "ediliyor", "edilmesi", "ediyor", "elbet", "elbette", "elli", "emme", "en", "enikonu", "epey", "epeyce", "epeyi", "esasen", "esnasında", "etmesi", "etraflı", "etraflıca", "etti", "ettiği", "ettiğini", "evleviyetle", "evvel", "evvela", "evvelce", "evvelden", "evvelemirde", "evveli", "eđer", "eğer", "fakat", "filanca", "gah", "gayet", "gayetle", "gayri", "gayrı", "gelgelelim", "gene", "gerek", "gerçi", "geçende", "geçenlerde", "gibi", "gibilerden", "gibisinden", "gine", "göre", "gırla", "hakeza", "halbuki", "halen", "halihazırda", "haliyle", "handiyse", "hangi", "hangisi", "hani", "hariç", "hasebiyle", "hasılı", "hatta", "hele", "hem", "henüz", "hep", "hepsi", "her", "herhangi", "herkes", "herkesin", "hiç", "hiçbir", "hiçbiri", 
"hoş", "hulasaten", "iken", "iki", "ila", "ile", "ilen", "ilgili", "ilk", "illa", "illaki", "imdi", "indinde", "inen", "insermi", "ise", "ister", "itibaren", "itibariyle", "itibarıyla", "iyi", "iyice", "iyicene", "için", "iş", "işte", "iţte", "kadar", "kaffesi", "kah", "kala", "kanýmca", "karşın", "katrilyon", "kaynak", "kaçı", "kelli", "kendi", "kendilerine", "kendini", "kendisi", "kendisine", "kendisini", "kere", "kez", "keza", "kezalik", "keşke", "keţke", "ki", "kim", "kimden", "kime", "kimi", "kimisi", "kimse", "kimsecik", "kimsecikler", "külliyen", "kýrk", "kýsaca", "kırk", "kısaca", "lakin", "leh", "lütfen", "maada", "madem", "mademki", "mamafih", "mebni", "međer", "meğer", "meğerki", "meğerse", "milyar", "milyon", "mu", "mü", "mý", "mı", "nasýl", "nasıl", "nasılsa", "nazaran", "naşi", "ne", "neden", "nedeniyle", "nedenle", "nedense", "nerde", "nerden", "nerdeyse", "nere", "nerede", "nereden", "neredeyse", "neresi", "nereye", "netekim", "neye", "neyi", "neyse", "nice", "nihayet", "nihayetinde", "nitekim", "niye", "niçin", "o", "olan", "olarak", "oldu", "olduklarını", "oldukça", "olduğu", "olduğunu", "olmadı", "olmadığı", "olmak", "olması", "olmayan", "olmaz", "olsa", "olsun", "olup", "olur", "olursa", "oluyor", "on", "ona", "onca", "onculayın", "onda", "ondan", "onlar", "onlardan", "onlari", "onlarýn", "onları", "onların", "onu", "onun", "oracık", "oracıkta", "orada", "oradan", "oranca", "oranla", "oraya", "otuz", "oysa", "oysaki", "pek", "pekala", "peki", "pekçe", "peyderpey", "rağmen", "sadece", "sahi", "sahiden", "sana", "sanki", "sekiz", "seksen", "sen", "senden", "seni", "senin", "siz", "sizden", "sizi", "sizin", "sonra", "sonradan", "sonraları", "sonunda", "tabii", "tam", "tamam", "tamamen", "tamamıyla", "tarafından", "tek", "trilyon", "tüm", "var", "vardı", "vasıtasıyla", "ve", "velev", "velhasıl", "velhasılıkelam", "veya", "veyahut", "ya", "yahut", "yakinen", "yakında", "yakından", "yakınlarda", "yalnız", "yalnızca", "yani", "yapacak", "yapmak", 
"yaptı", "yaptıkları", "yaptığı", "yaptığını", "yapılan", "yapılması", "yapıyor", "yedi", "yeniden", "yenilerde", "yerine", "yetmiþ", "yetmiş", "yetmiţ", "yine", "yirmi", "yok", "yoksa", "yoluyla", "yüz", "yüzünden", "zarfında", "zaten", "zati", "zira", "çabuk", "çabukça", "çeşitli", "çok", "çokları", "çoklarınca", "çokluk", "çoklukla", "çokça", "çoğu", "çoğun", "çoğunca", "çoğunlukla", "çünkü", "öbür", "öbürkü", "öbürü", "önce", "önceden", "önceleri", "öncelikle", "öteki", "ötekisi", "öyle", "öylece", "öylelikle", "öylemesine", "öz", "üzere", "üç", "þey", "þeyden", "þeyi", "þeyler", "þu", "þuna", "þunda", "þundan", "þunu", "şayet", "şey", "şeyden", "şeyi", "şeyler", "şu", "şuna", "şuncacık", "şunda", "şundan", "şunlar", "şunları", "şunu", "şunun", "şura", "şuracık", "şuracıkta", "şurası", "şöyle", "ţayet", "ţimdi", "ţu", "ţöyle" )) YIDDISH = frozenset(( "א", "אבער", "אדער", "אונ", "אז", "אזױ", "איב", "איז", "אימ", "אינ", "איר", "אלס", "אלע", "אנ", "אפילו", "אױב", "אױכ", "אױס", "אױפ", "ביז", "בײ", "דא", "דאס", "דורכ", "די", "דעמ", "דער", "האב", "האט", "זאל", "זי", "זיכ", "זענ", "זײ", "זײנ", "זײער", "מיט", "מענ", "מער", "נאכ", "נאר", "ניש", "עס", "ער", "פאר", "פונ", "צו", "צומ", "צװיש", "קעג", "קענ", "קײנ", "שױנ", "װאס", "װאר", "װי", "װעט", "װעלכ", "װענ", "װער" )) # Maps Snowball stemmer language name -> stopword set # porter -> english, dutch_porter -> dutch _BUILTIN_STOPWORDS: dict[str, frozenset[str]] = { "arabic": ARABIC, "armenian": ARMENIAN, "basque": BASQUE, "catalan": CATALAN, "danish": DANISH, "dutch": DUTCH, "dutch_porter": DUTCH, "english": ENGLISH, "esperanto": ESPERANTO, "estonian": ESTONIAN, "finnish": FINNISH, "french": FRENCH, "german": GERMAN, "greek": GREEK, "hindi": HINDI, "hungarian": HUNGARIAN, "indonesian": INDONESIAN, "irish": IRISH, "italian": ITALIAN, "lithuanian": LITHUANIAN, "nepali": NEPALI, "norwegian": NORWEGIAN, "porter": ENGLISH, "portuguese": PORTUGUESE, "romanian": ROMANIAN, "russian": RUSSIAN, "serbian": SERBIAN, "spanish": 
import re
import unicodedata
from typing import Callable, List, Set

import snowballstemmer

from simplebayes.stopwords_data import _BUILTIN_STOPWORDS

# Tokens are separated by any run of non-word characters. "\w" is
# Unicode-aware here, so letters/digits of every script (and "_") stay
# inside a token.
TOKEN_SPLIT_PATTERN = re.compile(r"[^\w]+", re.UNICODE)

# Memo of stop-word sets, keyed by the lowercased language name.
_STOPWORDS_CACHE: dict[str, Set[str]] = {}


def _get_stop_words(language: str) -> Set[str]:
    """
    Return the built-in stop words for a language.

    The lookup is case-insensitive: every key of ``_BUILTIN_STOPWORDS`` is
    lowercase, so "English" must resolve to the same list as "english"
    instead of silently yielding no stop words.

    :param language: Snowball language name (e.g. "english", "spanish").
    :return: Set of stop words; an empty set when no list exists for the
        language. The set is cached and shared between callers, so treat it
        as read-only.
    """
    key = language.lower()
    cached = _STOPWORDS_CACHE.get(key)
    if cached is None:
        # Empty results are cached too, exactly like non-empty ones.
        cached = set(_BUILTIN_STOPWORDS.get(key, ()))
        _STOPWORDS_CACHE[key] = cached
    return cached


def create_tokenizer(
    language: str = "english",
    remove_stop_words: bool = False,
) -> Callable[[str], List[str]]:
    """
    Create a tokenizer with the given language and stop-word settings.

    The returned function NFKC-normalizes and lowercases the text, splits it
    on ``TOKEN_SPLIT_PATTERN``, optionally drops stop words, and stems what
    is left.

    Stop words are matched against the raw tokens *before* stemming, because
    the built-in lists contain surface forms (inflected verbs, accented
    words) whose stems generally differ from the listed spelling; e.g. the
    English stemmer is expected to reduce "because" to "becaus", which is
    not in the list. The post-stemming check of the previous implementation
    is kept as well, so every token that used to be removed still is.

    :param language: Language code for stemmer and stop words
        (e.g. "english", "spanish").
    :param remove_stop_words: If True, filter out stop words.
        Default False (backwards compatible).
    :return: A tokenize function mapping a string to a list of stemmed tokens.
    :raises: Whatever ``snowballstemmer.stemmer`` raises for an unknown
        language (presumably ``KeyError`` -- confirm against the package).
    """
    stemmer = snowballstemmer.stemmer(language)
    stop_words: Set[str] = _get_stop_words(language) if remove_stop_words else set()

    def tokenize(text: str) -> List[str]:
        if not text:
            return []

        normalized = unicodedata.normalize("NFKC", text).lower()
        raw_tokens = [t for t in TOKEN_SPLIT_PATTERN.split(normalized) if t]
        if stop_words:
            # Match surface forms first; see the docstring for why.
            raw_tokens = [t for t in raw_tokens if t not in stop_words]
        if not raw_tokens:
            return []

        stemmed = stemmer.stemWords(raw_tokens)
        if stop_words:
            # Also drop stems that coincide with a stop word (old behavior).
            return [t for t in stemmed if t and t not in stop_words]
        return [t for t in stemmed if t]

    return tokenize


def default_tokenize_text(
    text: str,
    language: str = "english",
    remove_stop_words: bool = False,
) -> List[str]:
    """
    Normalizes, tokenizes, stems, and optionally removes stop words.

    Convenience wrapper that builds a tokenizer with ``create_tokenizer`` on
    every call and applies it once; callers tokenizing many texts can build
    the tokenizer once themselves and reuse it.

    :param text: Input text.
    :param language: Language code. Default "english".
    :param remove_stop_words: If True, filter stop words.
        Default False (backwards compatible).
    :return: List of tokens.
    """
    tokenizer = create_tokenizer(
        language=language,
        remove_stop_words=remove_stop_words,
    )
    return tokenizer(text)
@patch.object(BayesCategories, 'get_category') @patch.object(BayesCategories, 'add_category') def test_train_with_new_category( self, add_category_mock, get_category_mock ): cat_mock = MagicMock() cat_mock.train_token.return_value = None get_category_mock.side_effect = KeyError() add_category_mock.return_value = cat_mock sb = SimpleBayes() sb.train('foo', 'hello world hello') add_category_mock.assert_called_with('foo') cat_mock.train_token.assert_any_call('hello', 2) cat_mock.train_token.assert_any_call('world', 1) @patch.object(BayesCategories, 'get_categories') def test_classify(self, get_categories_mock): cat1_mock = MagicMock() cat1_mock.get_token_count.return_value = 2 cat1_mock.get_tally.return_value = 8 cat2_mock = MagicMock() cat2_mock.get_token_count.return_value = 4 cat2_mock.get_tally.return_value = 32 get_categories_mock.return_value = { 'foo': cat1_mock, 'bar': cat2_mock } sb = SimpleBayes() sb.calculate_category_probability() result = sb.classify('hello world') self.assertEqual('bar', result) assert 3 == get_categories_mock.call_count, \ get_categories_mock.call_count cat1_mock.get_token_count.assert_any_call('hello') cat1_mock.get_token_count.assert_any_call('world') cat1_mock.get_tally.assert_called_once_with() cat2_mock.get_token_count.assert_any_call('hello') cat2_mock.get_token_count.assert_any_call('world') cat2_mock.get_tally.assert_called_once_with() @patch.object(BayesCategories, 'get_categories') def test_classify_without_categories(self, get_categories_mock): get_categories_mock.return_value = {} sb = SimpleBayes() result = sb.classify('hello world') self.assertIsNone(result) assert 2 == get_categories_mock.call_count, \ get_categories_mock.call_count @patch.object(BayesCategories, 'get_categories') def test_classify_with_empty_category(self, get_categories_mock): cat_mock = MagicMock() cat_mock.get_tally.return_value = 0 cat_mock.get_token_count.return_value = 0 get_categories_mock.return_value = { 'foo': cat_mock } sb = SimpleBayes() 
sb.calculate_category_probability() result = sb.classify('hello world') self.assertIsNone(result) assert 3 == get_categories_mock.call_count, \ get_categories_mock.call_count cat_mock.get_tally.assert_called_once_with() def test_score_without_categories(self): sb = SimpleBayes() self.assertEqual(sb.score('hello world'), {}) def test_score_with_no_matching_tokens(self): sb = SimpleBayes() sb.train('alpha', 'one two three') self.assertEqual(sb.score('unknown tokens here'), {}) @patch.object(BayesCategories, 'get_categories') def test_score(self, get_categories_mock): cat1_mock = MagicMock() cat1_mock.get_token_count.return_value = 2 cat1_mock.get_tally.return_value = 8 cat2_mock = MagicMock() cat2_mock.get_token_count.return_value = 4 cat2_mock.get_tally.return_value = 32 get_categories_mock.return_value = { 'foo': cat1_mock, 'bar': cat2_mock } sb = SimpleBayes() sb.calculate_category_probability() result = sb.score('hello world') self.assertIn('foo', result) self.assertIn('bar', result) self.assertAlmostEqual(result['foo'], 0.22222222222222224) self.assertAlmostEqual(result['bar'], 1.777777777777778) assert 3 == get_categories_mock.call_count, \ get_categories_mock.call_count cat1_mock.get_token_count.assert_any_call('hello') cat1_mock.get_token_count.assert_any_call('world') cat1_mock.get_tally.assert_called_once_with() cat2_mock.get_token_count.assert_any_call('hello') cat2_mock.get_token_count.assert_any_call('world') cat2_mock.get_tally.assert_called_once_with() @patch.object(BayesCategories, 'get_categories') def test_score_with_zero_bayes_denon(self, get_categories_mock): cat1_mock = MagicMock() cat1_mock.get_token_count.return_value = 2 cat1_mock.get_tally.return_value = 8 cat2_mock = MagicMock() cat2_mock.get_token_count.return_value = 4 cat2_mock.get_tally.return_value = 32 get_categories_mock.return_value = { 'foo': cat1_mock, 'bar': cat2_mock } sb = SimpleBayes() sb.calculate_category_probability() sb.probabilities['foo']['prc'] = 0 
sb.probabilities['foo']['prnc'] = 0 result = sb.score('hello world') self.assertEqual( { 'bar': 1.777777777777778 }, result ) assert 3 == get_categories_mock.call_count, \ get_categories_mock.call_count cat1_mock.get_token_count.assert_any_call('hello') cat1_mock.get_token_count.assert_any_call('world') cat1_mock.get_tally.assert_called_once_with() cat2_mock.get_token_count.assert_any_call('hello') cat2_mock.get_token_count.assert_any_call('world') cat2_mock.get_tally.assert_called_once_with() def test_classify_result(self): sb = SimpleBayes() sb.train('good', 'bright happy joy') sb.train('bad', 'sad dark doom') result = sb.classify_result('bright joy') self.assertEqual(result.category, 'good') self.assertGreater(result.score, 0) def test_classify_result_empty(self): sb = SimpleBayes() result = sb.classify_result('anything') self.assertIsNone(result.category) self.assertEqual(result.score, 0.0) def test_get_summaries(self): sb = SimpleBayes() sb.train('alpha', 'one two three') summaries = sb.get_summaries() self.assertIn('alpha', summaries) self.assertEqual(summaries['alpha'].token_tally, 3) self.assertGreaterEqual(summaries['alpha'].prob_in_cat, 0.0) self.assertGreaterEqual(summaries['alpha'].prob_not_in_cat, 0.0) def test_train_invalid_category_raises(self): sb = SimpleBayes() with self.assertRaises(InvalidCategoryError): sb.train('bad category', 'text') with self.assertRaises(InvalidCategoryError): sb.train(None, 'text') # type: ignore[arg-type] def test_untrain_removes_empty_category(self): sb = SimpleBayes() sb.train('alpha', 'one two three') sb.untrain('alpha', 'one two three') self.assertNotIn('alpha', sb.categories.get_categories()) self.assertNotIn('alpha', sb.probabilities) self.assertNotIn('alpha', sb.get_summaries()) def test_classify_tie_breaks_lexically(self): sb = SimpleBayes() sb.train('zeta', 'match token') sb.train('alpha', 'match token') result = sb.classify('match token') self.assertEqual(result, 'alpha') def test_laplace_smoothing_alpha(self): 
sb = SimpleBayes(alpha=0.01) sb.train('spam', 'buy now click here') sb.train('ham', 'meeting tomorrow schedule') result = sb.classify_result('click offer') self.assertIsNotNone(result.category) self.assertGreater(result.score, 0) def test_language_and_remove_stop_words_params(self): sb = SimpleBayes(language="english", remove_stop_words=False) sb.train("foo", "the cat is in the hat") self.assertGreater(sb.tally("foo"), 2) # stop words counted hickeroar-simplebayes-4803e70/tests/build.sh000077500000000000000000000005251514723130200210500ustar00rootroot00000000000000#!/bin/bash set -e echo echo " [simplebayes] Step 1: Executing Unit Tests" echo pytest tests/ --cov=simplebayes --cov-fail-under=100 -v rm -f .coverage* echo echo " [simplebayes] Step 2: Executing flake8" echo flake8 simplebayes tests echo echo " [simplebayes] Step 3: Executing pylint" echo pylint simplebayes tests --fail-under=10 echo hickeroar-simplebayes-4803e70/tests/categories.py000066400000000000000000000013471514723130200221140ustar00rootroot00000000000000# pylint: disable=invalid-name,missing-docstring from simplebayes.categories import BayesCategories from simplebayes.category import BayesCategory import unittest class BayesCategoriesTests(unittest.TestCase): def test_add_category(self): bc = BayesCategories() bc.add_category('foo') self.assertIn('foo', bc.categories) self.assertIsInstance(bc.categories['foo'], BayesCategory) def test_get_category(self): bc = BayesCategories() bc.add_category('foo') self.assertIsInstance(bc.get_category('foo'), BayesCategory) def test_get_categories(self): bc = BayesCategories() bc.add_category('foo') self.assertEqual(bc.get_categories(), bc.categories) hickeroar-simplebayes-4803e70/tests/category.py000066400000000000000000000025101514723130200215750ustar00rootroot00000000000000# pylint: disable=invalid-name,missing-docstring from simplebayes.category import BayesCategory import unittest class BayesCategoryTests(unittest.TestCase): def test_train_token(self): bc 
= BayesCategory('foo') bc.train_token('foo', 5) bc.train_token('bar', 7) self.assertEqual(12, bc.tally) self.assertIn('foo', bc.tokens) self.assertEqual(bc.tokens['foo'], 5) def test_untrain_token(self): bc = BayesCategory('foo') bc.train_token('foo', 5) bc.train_token('bar', 7) self.assertEqual(12, bc.tally) self.assertIn('foo', bc.tokens) self.assertIn('bar', bc.tokens) self.assertEqual(bc.tokens['foo'], 5) self.assertEqual(bc.tokens['bar'], 7) bc.untrain_token('foo', 3) bc.untrain_token('bar', 20) bc.untrain_token('baz', 5) self.assertEqual(2, bc.tally) self.assertEqual(bc.tokens['foo'], 2) self.assertNotIn('bar', bc.tokens) def test_get_token_count(self): bc = BayesCategory('foo') bc.train_token('foo', 5) self.assertEqual(bc.get_token_count('foo'), 5) self.assertEqual(bc.get_token_count('bar'), 0) def test_get_tally(self): bc = BayesCategory('foo') bc.train_token('foo', 5) self.assertEqual(5, bc.get_tally()) hickeroar-simplebayes-4803e70/tests/test.py000066400000000000000000000005051514723130200207410ustar00rootroot00000000000000#!/usr/bin/python # pylint: disable=unused-wildcard-import,wildcard-import,unused-import # Test aggregator: star imports register test classes (F401/F403 in .flake8) import simplebayes import simplebayes.categories import simplebayes.category from tests import * from tests.categories import * from tests.category import * hickeroar-simplebayes-4803e70/tests/test_api_endpoints.py000066400000000000000000000264301514723130200236620ustar00rootroot00000000000000from fastapi.testclient import TestClient import pytest from simplebayes.api.app import create_app def test_health_and_ready_endpoints(): app = create_app() client = TestClient(app) assert client.get("/healthz").json() == {"status": "ok"} assert client.get("/readyz").json() == {"status": "ready"} def test_readyz_returns_503_when_not_ready(): app = create_app() app.state.readiness.mark_not_ready() client = TestClient(app) response = client.get("/readyz") assert response.status_code == 
503 assert response.json() == {"status": "not ready"} def test_lifespan_marks_not_ready_on_shutdown(): app = create_app() with TestClient(app) as client: assert client.get("/readyz").status_code == 200 assert app.state.readiness.is_ready is True assert app.state.readiness.is_ready is False def test_train_info_score_classify_and_flush_flow(): client = TestClient(create_app()) headers = {"Content-Type": "text/plain"} train_response = client.post("/train/spam", content="buy now limited offer", headers=headers) assert train_response.status_code == 200 assert train_response.json()["success"] is True info_response = client.get("/info") assert info_response.status_code == 200 assert "spam" in info_response.json()["categories"] score_response = client.post("/score", content="limited offer", headers=headers) assert score_response.status_code == 200 assert "spam" in score_response.json() classify_response = client.post("/classify", content="limited offer", headers=headers) assert classify_response.status_code == 200 assert classify_response.json()["category"] == "spam" untrain_response = client.post("/untrain/spam", content="buy now limited offer", headers=headers) assert untrain_response.status_code == 200 assert untrain_response.json()["success"] is True flush_response = client.post("/flush", content="", headers=headers) assert flush_response.status_code == 200 assert flush_response.json() == {"success": True, "categories": {}} def test_invalid_category_route_returns_422(): client = TestClient(create_app()) response = client.post( "/train/bad route", content="sample", headers={"Content-Type": "text/plain"}, ) assert response.status_code == 422 def test_wrong_method_returns_405(): client = TestClient(create_app()) response = client.get("/classify") assert response.status_code == 405 def test_auth_required_for_non_probe_endpoints(): client = TestClient(create_app(auth_token="secret-token")) unauthorized = client.get("/info") assert unauthorized.status_code == 401 assert 
unauthorized.json() == {"error": "unauthorized"} assert unauthorized.headers["www-authenticate"] == 'Bearer realm="simplebayes"' wrong_token = client.get("/info", headers={"Authorization": "Bearer wrong-token"}) assert wrong_token.status_code == 401 assert wrong_token.json() == {"error": "unauthorized"} authorized = client.get("/info", headers={"Authorization": "Bearer secret-token"}) assert authorized.status_code == 200 def test_probes_remain_unauthenticated_with_auth_enabled(): client = TestClient(create_app(auth_token="secret-token")) assert client.get("/healthz").status_code == 200 assert client.get("/readyz").status_code == 200 def test_payload_too_large_returns_413(): client = TestClient(create_app()) too_large = "x" * (1024 * 1024 + 1) response = client.post( "/score", content=too_large, headers={"Content-Type": "text/plain"}, ) assert response.status_code == 413 assert response.json() == {"error": "request body too large"} def test_classify_returns_null_category_when_untrained(): client = TestClient(create_app()) response = client.post( "/classify", content="anything", headers={"Content-Type": "text/plain"}, ) assert response.status_code == 200 assert response.json() == {"category": None, "score": 0.0} @pytest.mark.parametrize("path", ["/train/spam", "/classify", "/score"]) def test_invalid_utf8_payload_returns_400(path): client = TestClient(create_app()) response = client.post( path, content=b"\xff\xfe\xfa", headers={"Content-Type": "text/plain"}, ) assert response.status_code == 400 assert response.json() == {"error": "invalid utf-8 payload"} @pytest.mark.parametrize( "method,path", [ ("post", "/train/spam"), ("post", "/untrain/spam"), ("post", "/classify"), ("post", "/score"), ("post", "/flush"), ], ) def test_auth_rejected_for_each_mutating_endpoint(method, path): client = TestClient(create_app(auth_token="secret-token")) response = getattr(client, method)( path, content="body", headers={"Content-Type": "text/plain"}, ) assert response.status_code == 
401 assert response.json() == {"error": "unauthorized"} def test_auth_allows_mutating_endpoints_with_valid_token(): client = TestClient(create_app(auth_token="secret-token")) headers = { "Authorization": "Bearer secret-token", "Content-Type": "text/plain", } train_response = client.post("/train/spam", content="buy now limited offer", headers=headers) assert train_response.status_code == 200 assert train_response.json()["success"] is True score_response = client.post("/score", content="limited offer", headers=headers) assert score_response.status_code == 200 assert "spam" in score_response.json() classify_response = client.post("/classify", content="limited offer", headers=headers) assert classify_response.status_code == 200 assert classify_response.json()["category"] == "spam" untrain_response = client.post("/untrain/spam", content="buy now limited offer", headers=headers) assert untrain_response.status_code == 200 assert untrain_response.json()["success"] is True flush_response = client.post("/flush", content="", headers=headers) assert flush_response.status_code == 200 assert flush_response.json() == {"success": True, "categories": {}} @pytest.mark.parametrize( "auth_header", [ "Token secret-token", "Bearer", "Bearer ", "Basic c2VjcmV0LXRva2Vu", ], ) def test_auth_malformed_headers_are_rejected(auth_header): client = TestClient(create_app(auth_token="secret-token")) response = client.get("/info", headers={"Authorization": auth_header}) assert response.status_code == 401 assert response.json() == {"error": "unauthorized"} assert response.headers["www-authenticate"] == 'Bearer realm="simplebayes"' @pytest.mark.parametrize( "path", [ "/train/spam", "/untrain/spam", "/classify", "/score", "/flush", ], ) def test_payload_exactly_one_mebibyte_is_accepted(path): client = TestClient(create_app()) boundary_payload = "x" * (1024 * 1024) response = client.post( path, content=boundary_payload, headers={"Content-Type": "text/plain"}, ) assert response.status_code == 200 
@pytest.mark.parametrize( "path", [ "/train/spam", "/untrain/spam", "/classify", "/score", "/flush", ], ) def test_payload_too_large_for_each_text_endpoint(path): client = TestClient(create_app()) too_large = "x" * (1024 * 1024 + 1) response = client.post( path, content=too_large, headers={"Content-Type": "text/plain"}, ) assert response.status_code == 413 assert response.json() == {"error": "request body too large"} def test_create_app_with_language_spanish(): """Spanish stemmer produces different tokens than English (e.g. comprar vs buy).""" app = create_app(language="spanish") client = TestClient(app) headers = {"Content-Type": "text/plain"} client.post("/train/spam", content="comprar ahora oferta limitada", headers=headers) info = client.get("/info").json() assert "spam" in info["categories"] assert info["categories"]["spam"]["tokenTally"] > 0 score = client.post("/score", content="comprar oferta", headers=headers).json() assert "spam" in score def test_create_app_with_remove_stop_words(): """Stop words like 'the' and 'is' are filtered when remove_stop_words=True.""" app = create_app(remove_stop_words=True) client = TestClient(app) headers = {"Content-Type": "text/plain"} client.post("/train/spam", content="the buy now the offer", headers=headers) client.post("/train/ham", content="the meeting is tomorrow", headers=headers) info = client.get("/info").json() assert "spam" in info["categories"] assert "ham" in info["categories"] classify = client.post("/classify", content="the offer", headers=headers).json() assert classify["category"] == "spam" def test_verbose_mode_logs_to_stderr(capsys): app = create_app(verbose=True) client = TestClient(app) headers = {"Content-Type": "text/plain"} client.post("/train/spam", content="buy now", headers=headers) captured = capsys.readouterr() assert "[simplebayes]" in captured.err assert "POST" in captured.err or "train" in captured.err assert "spam" in captured.err assert "tokens=" in captured.err def 
test_verbose_mode_off_no_output(capsys): app = create_app(verbose=False) client = TestClient(app) headers = {"Content-Type": "text/plain"} client.post("/train/spam", content="buy now", headers=headers) captured = capsys.readouterr() assert "[simplebayes]" not in captured.err def test_verbose_middleware_content_length_branch(capsys): """Hit the Content-Length branch: GET has no body, POST has Content-Length.""" app = create_app(verbose=True) client = TestClient(app) client.get("/healthz") captured = capsys.readouterr() assert "[simplebayes]" in captured.err assert "GET" in captured.err def test_verbose_middleware_exception_branch(capsys): """Hit the exception branch by simulating a route that raises.""" app = create_app(verbose=True) async def raise_handler(request): raise RuntimeError("test") app.add_route("/raise", raise_handler, methods=["GET"]) client = TestClient(app) with pytest.raises(RuntimeError): client.get("/raise") captured = capsys.readouterr() assert "[simplebayes]" in captured.err assert "(exception)" in captured.err def test_verbose_middleware_large_response_body(capsys): """Hit the body truncation branch when response > 500 chars.""" app = create_app(verbose=True) client = TestClient(app) headers = {"Content-Type": "text/plain"} for i in range(25): client.post(f"/train/cat{i}", content=f"word{i} x y z", headers=headers) client.get("/info") captured = capsys.readouterr() assert "[simplebayes]" in captured.err assert "..." in captured.err def test_verbose_format_tokens_truncation(capsys): """Hit _format_tokens truncation when > 20 tokens.""" app = create_app(verbose=True) client = TestClient(app) headers = {"Content-Type": "text/plain"} long_text = " ".join(f"word{i}" for i in range(25)) client.post("/train/spam", content=long_text, headers=headers) captured = capsys.readouterr() assert "[simplebayes]" in captured.err assert "..." 
in captured.err hickeroar-simplebayes-4803e70/tests/test_categories.py000066400000000000000000000002351514723130200231460ustar00rootroot00000000000000"""Pytest-discovered wrapper for legacy categories tests.""" from tests.categories import BayesCategoriesTests # noqa: F401 pylint: disable=unused-import hickeroar-simplebayes-4803e70/tests/test_category.py000066400000000000000000000002271514723130200226370ustar00rootroot00000000000000"""Pytest-discovered wrapper for legacy category tests.""" from tests.category import BayesCategoryTests # noqa: F401 pylint: disable=unused-import hickeroar-simplebayes-4803e70/tests/test_cli.py000066400000000000000000000125621514723130200215760ustar00rootroot00000000000000from simplebayes import cli def test_parse_args_defaults(monkeypatch): monkeypatch.delenv("SIMPLEBAYES_HOST", raising=False) monkeypatch.delenv("SIMPLEBAYES_PORT", raising=False) monkeypatch.delenv("SIMPLEBAYES_AUTH_TOKEN", raising=False) monkeypatch.delenv("SIMPLEBAYES_LANGUAGE", raising=False) monkeypatch.delenv("SIMPLEBAYES_REMOVE_STOP_WORDS", raising=False) monkeypatch.delenv("SIMPLEBAYES_VERBOSE", raising=False) args = cli.parse_args([]) assert args.host == "0.0.0.0" assert args.port == 8000 assert args.auth_token == "" assert args.language == "english" assert args.remove_stop_words is False assert args.verbose is False def test_parse_args_uses_env(monkeypatch): monkeypatch.setenv("SIMPLEBAYES_HOST", "127.0.0.1") monkeypatch.setenv("SIMPLEBAYES_PORT", "9000") monkeypatch.setenv("SIMPLEBAYES_AUTH_TOKEN", "env-token") args = cli.parse_args([]) assert args.host == "127.0.0.1" assert args.port == 9000 assert args.auth_token == "env-token" def test_parse_args_cli_overrides_env(monkeypatch): monkeypatch.setenv("SIMPLEBAYES_HOST", "127.0.0.1") monkeypatch.setenv("SIMPLEBAYES_PORT", "9000") monkeypatch.setenv("SIMPLEBAYES_AUTH_TOKEN", "env-token") args = cli.parse_args(["--host", "localhost", "--port", "8123", "--auth-token", "cli-token"]) assert args.host == "localhost" 
assert args.port == 8123 assert args.auth_token == "cli-token" def test_parse_args_language_default(monkeypatch): monkeypatch.delenv("SIMPLEBAYES_LANGUAGE", raising=False) monkeypatch.delenv("SIMPLEBAYES_REMOVE_STOP_WORDS", raising=False) monkeypatch.delenv("SIMPLEBAYES_VERBOSE", raising=False) args = cli.parse_args([]) assert args.language == "english" def test_parse_args_language_cli(): args = cli.parse_args(["--language", "spanish"]) assert args.language == "spanish" def test_parse_args_language_env(monkeypatch): monkeypatch.setenv("SIMPLEBAYES_LANGUAGE", "spanish") args = cli.parse_args([]) assert args.language == "spanish" def test_parse_args_remove_stop_words_default(monkeypatch): monkeypatch.delenv("SIMPLEBAYES_REMOVE_STOP_WORDS", raising=False) args = cli.parse_args([]) assert args.remove_stop_words is False def test_parse_args_remove_stop_words_flag(): args = cli.parse_args(["--remove-stop-words"]) assert args.remove_stop_words is True def test_parse_args_remove_stop_words_env(monkeypatch): monkeypatch.setenv("SIMPLEBAYES_REMOVE_STOP_WORDS", "1") args = cli.parse_args([]) assert args.remove_stop_words is True def test_parse_args_remove_stop_words_env_yes(monkeypatch): monkeypatch.setenv("SIMPLEBAYES_REMOVE_STOP_WORDS", "yes") args = cli.parse_args([]) assert args.remove_stop_words is True def test_parse_args_remove_stop_words_env_false(monkeypatch): monkeypatch.setenv("SIMPLEBAYES_REMOVE_STOP_WORDS", "0") args = cli.parse_args([]) assert args.remove_stop_words is False def test_parse_args_verbose_default(monkeypatch): monkeypatch.delenv("SIMPLEBAYES_VERBOSE", raising=False) args = cli.parse_args([]) assert args.verbose is False def test_parse_args_verbose_flag(): args = cli.parse_args(["--verbose"]) assert args.verbose is True def test_parse_args_verbose_env(monkeypatch): monkeypatch.setenv("SIMPLEBAYES_VERBOSE", "true") args = cli.parse_args([]) assert args.verbose is True def test_run_invokes_uvicorn(monkeypatch): captured = {} def fake_create_app( 
auth_token: str = "", language: str = "english", remove_stop_words: bool = False, verbose: bool = False, ): captured["auth_token"] = auth_token captured["language"] = language captured["remove_stop_words"] = remove_stop_words captured["verbose"] = verbose return "app-object" def fake_uvicorn_run(app, host, port): captured["app"] = app captured["host"] = host captured["port"] = port monkeypatch.setattr(cli, "create_app", fake_create_app) monkeypatch.setattr(cli.uvicorn, "run", fake_uvicorn_run) cli.run(["--host", "localhost", "--port", "8181", "--auth-token", "top-secret"]) assert captured["auth_token"] == "top-secret" assert captured["language"] == "english" assert captured["remove_stop_words"] is False assert captured["verbose"] is False assert captured["app"] == "app-object" assert captured["host"] == "localhost" assert captured["port"] == 8181 def test_run_passes_language_remove_stop_words_verbose(monkeypatch): captured = {} def fake_create_app( auth_token: str = "", language: str = "english", remove_stop_words: bool = False, verbose: bool = False, ): captured["auth_token"] = auth_token captured["language"] = language captured["remove_stop_words"] = remove_stop_words captured["verbose"] = verbose return "app-object" monkeypatch.setattr(cli, "create_app", fake_create_app) monkeypatch.setattr(cli.uvicorn, "run", lambda *a, **k: None) cli.run( [ "--auth-token", "x", "--language", "spanish", "--remove-stop-words", "--verbose", ] ) assert captured["language"] == "spanish" assert captured["remove_stop_words"] is True assert captured["verbose"] is True hickeroar-simplebayes-4803e70/tests/test_cli_integration.py000066400000000000000000000042621514723130200241770ustar00rootroot00000000000000import os import socket import subprocess import sys import time import urllib.error import urllib.request def _find_free_port() -> int: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: sock.bind(("127.0.0.1", 0)) return sock.getsockname()[1] def _wait_for_healthz(port: 
int, timeout_seconds: float = 8.0) -> bool: deadline = time.time() + timeout_seconds url = f"http://127.0.0.1:{port}/healthz" while time.time() < deadline: try: with urllib.request.urlopen(url, timeout=0.5) as response: return response.status == 200 except (urllib.error.URLError, TimeoutError): time.sleep(0.2) return False def test_cli_module_help_exits_zero(): result = subprocess.run( [sys.executable, "-m", "simplebayes.cli", "--help"], capture_output=True, text=True, timeout=10, check=False, ) assert result.returncode == 0 assert "Run the simplebayes API server." in result.stdout assert "language" in result.stdout assert "remove-stop-words" in result.stdout assert "verbose" in result.stdout def test_cli_module_fails_with_invalid_env_port(): env = os.environ.copy() env["SIMPLEBAYES_PORT"] = "not-a-number" result = subprocess.run( [sys.executable, "-m", "simplebayes.cli"], capture_output=True, text=True, timeout=10, env=env, check=False, ) assert result.returncode != 0 assert "invalid literal for int()" in result.stderr def test_cli_server_starts_and_serves_healthz(): port = _find_free_port() with subprocess.Popen( [ sys.executable, "-m", "simplebayes.cli", "--host", "127.0.0.1", "--port", str(port), ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) as process: try: assert _wait_for_healthz(port) finally: process.terminate() try: process.wait(timeout=5) except subprocess.TimeoutExpired: process.kill() process.wait(timeout=5) hickeroar-simplebayes-4803e70/tests/test_concurrency.py000066400000000000000000000032401514723130200233520ustar00rootroot00000000000000from concurrent.futures import ThreadPoolExecutor from simplebayes import SimpleBayes def test_parallel_train_and_score_completes(): classifier = SimpleBayes() def train_and_score(index: int) -> None: classifier.train("tech", f"python fastapi service sample {index}") _ = classifier.score("python service") with ThreadPoolExecutor(max_workers=8) as pool: list(pool.map(train_and_score, range(50))) result = 
classifier.classify_result("python service") assert result.category == "tech" assert result.score > 0 summaries = classifier.get_summaries() assert summaries["tech"].token_tally == 250 assert classifier.tally("tech") == 250 def test_parallel_classify_during_mutation(): classifier = SimpleBayes() classifier.train("alpha", "one two three") classifier.train("beta", "four five six") def mutate() -> None: for _ in range(50): classifier.train("alpha", "one two three") classifier.untrain("alpha", "one") def classify() -> None: for _ in range(50): _ = classifier.classify("one five") _ = classifier.get_summaries() with ThreadPoolExecutor(max_workers=4) as pool: futures = [pool.submit(mutate), pool.submit(classify), pool.submit(classify)] for future in futures: future.result() assert classifier.tally("alpha") == 103 assert classifier.tally("beta") == 3 summaries = classifier.get_summaries() assert summaries["alpha"].token_tally == 103 assert summaries["beta"].token_tally == 3 assert abs((summaries["alpha"].prob_in_cat + summaries["alpha"].prob_not_in_cat) - 1.0) < 1e-12 hickeroar-simplebayes-4803e70/tests/test_contracts.py000066400000000000000000000020061514723130200230170ustar00rootroot00000000000000from simplebayes.errors import ( InvalidCategoryError, InvalidModelStateError, PayloadTooLargeError, PersistencePathError, SimpleBayesError, UnsupportedModelVersionError, ) from simplebayes.models import CategorySummary, ClassificationResult def test_classification_result_fields(): result = ClassificationResult(category="spam", score=2.5) assert result.category == "spam" assert result.score == 2.5 def test_category_summary_fields(): summary = CategorySummary(token_tally=3, prob_in_cat=0.75, prob_not_in_cat=0.25) assert summary.token_tally == 3 assert summary.prob_in_cat == 0.75 assert summary.prob_not_in_cat == 0.25 def test_error_hierarchy(): assert issubclass(InvalidCategoryError, SimpleBayesError) assert issubclass(PersistencePathError, SimpleBayesError) assert 
issubclass(UnsupportedModelVersionError, SimpleBayesError) assert issubclass(InvalidModelStateError, SimpleBayesError) assert issubclass(PayloadTooLargeError, SimpleBayesError) hickeroar-simplebayes-4803e70/tests/test_persistence.py000066400000000000000000000127111514723130200233470ustar00rootroot00000000000000import io import json import os import tempfile import pytest from simplebayes import SimpleBayes from simplebayes.errors import ( InvalidCategoryError, InvalidModelStateError, PersistencePathError, UnsupportedModelVersionError, ) from simplebayes.persistence import ( PERSISTED_MODEL_VERSION, dump_model_state, load_model_state_from_file, load_model_state, resolve_model_path, save_model_state_to_file, validate_model_state, ) def test_save_and_load_round_trip_stream(): classifier = SimpleBayes() classifier.train("spam", "buy now limited offer") classifier.train("ham", "team schedule meeting") destination = io.StringIO() classifier.save(destination) destination.seek(0) loaded = SimpleBayes() loaded.load(destination) result = loaded.classify_result("limited offer") assert result.category == "spam" assert result.score > 0 def test_save_and_load_round_trip_file(): classifier = SimpleBayes() classifier.train("alpha", "one two three") with tempfile.TemporaryDirectory() as temp_dir: path = os.path.join(temp_dir, "model.json") classifier.save_to_file(path) loaded = SimpleBayes() loaded.load_from_file(path) assert loaded.tally("alpha") == 3 def test_resolve_model_path_requires_absolute(): with pytest.raises(PersistencePathError): resolve_model_path("relative/path.json") def test_load_model_state_invalid_json(): with pytest.raises(InvalidModelStateError): load_model_state(io.StringIO("{not json")) def test_load_model_state_none_and_non_object(): with pytest.raises(InvalidModelStateError): load_model_state(None) # type: ignore[arg-type] with pytest.raises(InvalidModelStateError): load_model_state(io.StringIO("[]")) def test_dump_model_state_requires_stream(): with 
pytest.raises(InvalidModelStateError): dump_model_state(None, {}) # type: ignore[arg-type] def test_validate_model_state_errors(): with pytest.raises(UnsupportedModelVersionError): validate_model_state({"version": 999, "categories": {}}) with pytest.raises(InvalidModelStateError): validate_model_state({"version": PERSISTED_MODEL_VERSION, "categories": []}) with pytest.raises(InvalidModelStateError): validate_model_state( { "version": PERSISTED_MODEL_VERSION, "categories": {"alpha": {"tally": 1, "tokens": {"": 1}}}, }, ) with pytest.raises(InvalidModelStateError): validate_model_state( { "version": PERSISTED_MODEL_VERSION, "categories": {"alpha": {"tally": 2, "tokens": {"token": 1}}}, }, ) with pytest.raises(InvalidModelStateError): validate_model_state( { "version": PERSISTED_MODEL_VERSION, "categories": {"alpha": []}, }, ) with pytest.raises(InvalidModelStateError): validate_model_state( { "version": PERSISTED_MODEL_VERSION, "categories": {"alpha": {"tally": -1, "tokens": {"token": 1}}}, }, ) with pytest.raises(InvalidModelStateError): validate_model_state( { "version": PERSISTED_MODEL_VERSION, "categories": {"alpha": {"tally": 1, "tokens": []}}, }, ) with pytest.raises(InvalidModelStateError): validate_model_state( { "version": PERSISTED_MODEL_VERSION, "categories": {"alpha": {"tally": 1, "tokens": {"token": 0}}}, }, ) def test_load_rejects_invalid_payload(): classifier = SimpleBayes() state = { "version": PERSISTED_MODEL_VERSION, "categories": {"bad category": {"tally": 1, "tokens": {"x": 1}}}, } payload = io.StringIO(json.dumps(state)) with pytest.raises(InvalidModelStateError, match="invalid category name"): classifier.load(payload) def test_category_validation_consistent_between_runtime_and_persistence(): for category in ["alpha-1", "A_B", "x" * 64]: assert SimpleBayes.normalize_category(category) == category validate_model_state( { "version": PERSISTED_MODEL_VERSION, "categories": {category: {"tally": 1, "tokens": {"token": 1}}}, }, ) with 
pytest.raises(InvalidCategoryError): SimpleBayes.normalize_category("bad category") with pytest.raises(InvalidModelStateError, match="invalid category name"): validate_model_state( { "version": PERSISTED_MODEL_VERSION, "categories": {"bad category": {"tally": 1, "tokens": {"token": 1}}}, }, ) def test_save_model_state_cleanup_on_replace_failure(monkeypatch): with tempfile.TemporaryDirectory() as temp_dir: model_path = os.path.join(temp_dir, "model.json") state = {"version": PERSISTED_MODEL_VERSION, "categories": {}} def _raise_replace(_src, _dst): raise RuntimeError("replace failed") monkeypatch.setattr("simplebayes.persistence.os.replace", _raise_replace) with pytest.raises(RuntimeError): save_model_state_to_file(model_path, state) def test_load_model_state_from_file_not_found(): with pytest.raises(FileNotFoundError): load_model_state_from_file("/tmp/simplebayes-missing-model.json") hickeroar-simplebayes-4803e70/tests/test_runtime_readiness.py000066400000000000000000000004631514723130200245440ustar00rootroot00000000000000from simplebayes.runtime.readiness import ReadinessState def test_readiness_state_transitions(): readiness = ReadinessState() assert readiness.is_ready is True readiness.mark_not_ready() assert readiness.is_ready is False readiness.mark_ready() assert readiness.is_ready is True hickeroar-simplebayes-4803e70/tests/test_simplebayes.py000066400000000000000000000002231514723130200233330ustar00rootroot00000000000000"""Pytest-discovered wrapper for legacy classifier unit tests.""" from tests import SimpleBayesTests # noqa: F401 pylint: disable=unused-import hickeroar-simplebayes-4803e70/tests/test_tokenization.py000066400000000000000000000064701514723130200235460ustar00rootroot00000000000000from simplebayes.tokenization import ( _get_stop_words, create_tokenizer, default_tokenize_text, ) def test_default_tokenize_text_empty(): assert default_tokenize_text("") == [] def test_default_tokenize_text_only_separators(): assert default_tokenize_text("!!! 
---") == [] def test_default_tokenize_text_normalizes_and_splits(): tokens = default_tokenize_text("Hello, WORLD!! 123") assert tokens == ["hello", "world", "123"] def test_default_tokenize_text_stems_words(): tokens = default_tokenize_text("running runner runs") assert tokens == ["run", "runner", "run"] def test_default_tokenize_text_nfkc_normalization(): tokens = default_tokenize_text("Foo Bar") assert tokens == ["foo", "bar"] def test_default_tokenize_text_handles_combining_marks(): tokens = default_tokenize_text("Cafe\u0301") assert tokens == ["café"] def test_default_tokenize_text_handles_zero_width_spacing(): tokens = default_tokenize_text("alpha\u200bbeta") assert tokens == ["alpha", "beta"] def test_default_tokenize_text_retains_stop_words_by_default(): """Default remove_stop_words=False keeps stop words (backwards compatible).""" tokens = default_tokenize_text("the cat is in the hat") assert "the" in tokens assert "is" in tokens assert "in" in tokens def test_default_tokenize_text_with_remove_stop_words_true_filters_stop_words(): tokens = default_tokenize_text( "the cat is in the hat", remove_stop_words=True ) assert "the" not in tokens assert "is" not in tokens assert "in" not in tokens assert "cat" in tokens or "hat" in tokens # content words retained def test_tokenizer_remove_stop_words_false_retains_all(): tokenize = create_tokenizer(language="english", remove_stop_words=False) tokens = tokenize("the cat is in the hat") assert len(tokens) > 2 # stop words retained def test_create_tokenizer_language_spanish(): tokenize = create_tokenizer(language="spanish", remove_stop_words=True) tokens = tokenize("el gato está en la casa") assert "el" not in tokens assert "la" not in tokens assert len(tokens) >= 2 # content words (gato/casa) retained, possibly stemmed def test_create_tokenizer_language_french_has_stopwords(): """All Snowball languages have built-in stopwords.""" tokenize = create_tokenizer(language="french", remove_stop_words=True) tokens = 
tokenize("le chat est dans la maison") assert "le" not in tokens assert "la" not in tokens assert "est" not in tokens def test_create_tokenizer_language_yiddish_has_stopwords(): """Yiddish has stopwords from Wiktionary/Wortschatz Leipzig frequency list.""" tokenize = create_tokenizer(language="yiddish", remove_stop_words=True) tokens = tokenize("די וועלט איז גרויס") # "The world is big" assert "די" not in tokens assert "איז" not in tokens assert len(tokens) >= 2 # content words (world, big) retained def test_get_stop_words_unknown_language_returns_empty(): words = _get_stop_words("nonexistentlangxyz123") assert words == set() def test_get_stop_words_caches_result(): """Second call for same language returns cached result.""" first = _get_stop_words("english") second = _get_stop_words("english") assert first is second assert "the" in first