pax_global_header00006660000000000000000000000064150771754460014532gustar00rootroot0000000000000052 comment=60e70533a17087b11d9b6c99e073d3f48d49c2e5 fcitx5-pinyin-zhwiki-0.3.0/000077500000000000000000000000001507717544600155435ustar00rootroot00000000000000fcitx5-pinyin-zhwiki-0.3.0/LICENSE000066400000000000000000000022731507717544600165540ustar00rootroot00000000000000This is free and unencumbered software released into the public domain. Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
For more information, please refer to
fcitx5-pinyin-zhwiki-0.3.0/Makefile000066400000000000000000000050651507717544600172110ustar00rootroot00000000000000
# Builds fcitx5-pinyin (libime) dictionaries and Rime dictionaries from the
# Wikimedia "all-titles-in-ns0" dumps and from the zhwiki web-slang list.
#
# Pipeline per dictionary <name>:
#   <dump>.gz -> <dump> -> <name>.source -> <name>.raw -> <name>.dict
#                                                      -> <name>.dict.yaml

# Date stamp of the Wikimedia dumps to download.
VERSION=20250820
# Date stamp used in the file names of the fetched web-slang page.
WEB_SLANG_VERSION=20250823
ZHWIKI_FILENAME=zhwiki-$(VERSION)-all-titles-in-ns0
ZHDICT_FILENAME=zhwiktionary-$(VERSION)-all-titles-in-ns0
ZHSRC_FILENAME=zhwikisource-$(VERSION)-all-titles-in-ns0
WEB_SLANG_FILE=web-slang-$(WEB_SLANG_VERSION).txt
WEB_SLANG_SOURCE=web-slang-$(WEB_SLANG_VERSION).wikitext

# Remove a half-written target when its recipe fails.
.DELETE_ON_ERROR:

# These names are commands, not files. The per-dictionary install-<name> and
# install_rime_dict-<name> targets are deliberately NOT listed here: they are
# provided by pattern rules, and implicit-rule search is skipped for phony
# targets.
.PHONY: all build build_rime_dict download install install_rime_dict clean

all: build

build: zhwiki.dict zhwiktionary.dict zhwikisource.dict web-slang.dict
build_rime_dict: zhwiki.dict.yaml zhwiktionary.dict.yaml zhwikisource.dict.yaml web-slang.dict.yaml

download: $(ZHWIKI_FILENAME).gz

$(ZHWIKI_FILENAME).gz:
	wget https://dumps.wikimedia.org/zhwiki/$(VERSION)/$(ZHWIKI_FILENAME).gz

$(ZHDICT_FILENAME).gz:
	wget https://dumps.wikimedia.org/zhwiktionary/$(VERSION)/$(ZHDICT_FILENAME).gz

$(ZHSRC_FILENAME).gz:
	wget https://dumps.wikimedia.org/zhwikisource/$(VERSION)/$(ZHSRC_FILENAME).gz

$(WEB_SLANG_SOURCE):
	./zhwiki-web-slang.py --fetch > $(WEB_SLANG_SOURCE)

$(WEB_SLANG_FILE): $(WEB_SLANG_SOURCE)
	./zhwiki-web-slang.py --process $(WEB_SLANG_SOURCE) > $(WEB_SLANG_FILE)

# Decompress a downloaded file, keeping the .gz next to it (-k).
%: %.gz
	gzip -k -d $<

zhwiki.source: $(ZHWIKI_FILENAME)
	cp $< $@

zhwiktionary.source: $(ZHDICT_FILENAME)
	cp $< $@

zhwikisource.source: $(ZHSRC_FILENAME)
	cp $< $@

web-slang.source: $(WEB_SLANG_FILE)
	cp $< $@

# Convert titles to "word pinyin weight" lines, then sort and de-duplicate.
# The unsorted output stays behind as $@.tmp until `make clean`.
%.raw: %.source
	./convert.py $< > $@.tmp
	sort -u $@.tmp > $@

%.dict: %.raw
	libime_pinyindict $< $@

# Rime dictionary: tab-separated columns, trailing 0 weight column dropped,
# pinyin syllables separated by spaces instead of apostrophes, YAML header
# prepended. Also writes the helper file $*.rime.raw.
%.dict.yaml: %.raw
	sed 's/[ ][ ]*/\t/g' $< > $*.rime.raw
	sed -i 's/\t0//g' $*.rime.raw
	sed -i "s/'/ /g" $*.rime.raw
	printf -- '---\nname: $*\nversion: "0.1"\nsort: by_weight\n...\n' > $@
	cat $*.rime.raw >> $@

install-%: %.dict
	install -Dm644 $< -t $(DESTDIR)/usr/share/fcitx5/pinyin/dictionaries/

install_rime_dict-%: %.dict.yaml
	install -Dm644 $< -t $(DESTDIR)/usr/share/rime-data/

# The stems below must match the dictionary names used by `build` and
# `build_rime_dict` (zhwiktionary, not "zhwikidictionary"); otherwise the
# pattern rules above have nothing to build the prerequisite from.
install: install-zhwiki install-zhwiktionary install-zhwikisource install-web-slang

install_rime_dict: install_rime_dict-zhwiki install_rime_dict-zhwiktionary install_rime_dict-zhwikisource install_rime_dict-web-slang

clean:
	rm -f $(ZHWIKI_FILENAME).gz $(ZHWIKI_FILENAME) zhwiki.source zhwiki.raw zhwiki.raw.tmp zhwiki.dict zhwiki.dict.yaml zhwiki.rime.raw
	rm -f $(ZHDICT_FILENAME).gz $(ZHDICT_FILENAME) zhwiktionary.source zhwiktionary.raw zhwiktionary.raw.tmp zhwiktionary.dict zhwiktionary.dict.yaml zhwiktionary.rime.raw
	rm -f $(ZHSRC_FILENAME).gz $(ZHSRC_FILENAME) zhwikisource.source zhwikisource.raw zhwikisource.raw.tmp zhwikisource.dict zhwikisource.dict.yaml zhwikisource.rime.raw
	rm -f $(WEB_SLANG_SOURCE) $(WEB_SLANG_FILE) web-slang.source web-slang.raw web-slang.raw.tmp web-slang.dict web-slang.dict.yaml web-slang.rime.raw
fcitx5-pinyin-zhwiki-0.3.0/README000066400000000000000000000016331507717544600164260ustar00rootroot00000000000000
zhwiki dictionary for fcitx5-pinyin and rime

Installation:

- Arch Linux:
  $ sudo pacman -S fcitx5-pinyin-zhwiki

- Plum (for RIME):
  $ rime-install felixonmars/fcitx5-pinyin-zhwiki

- Others:
  Download latest version of "zhwiki.dict" from:
  https://github.com/felixonmars/fcitx5-pinyin-zhwiki/releases
  Copy into ~/.local/share/fcitx5/pinyin/dictionaries/ (create the folder if it does not exist)

Build Requirements:

libime (https://github.com/fcitx/libime/)
Python modules:
  opencc (https://pypi.org/project/OpenCC/)
  pypinyin (https://pypi.org/project/pypinyin/)
  regex (https://pypi.org/project/regex/)
  requests (https://pypi.org/project/requests/)
  tenacity (https://pypi.org/project/tenacity/)

Manual Build & Installation:
  make
  sudo make install

Manual Build rime dict & Installation:
  make build_rime_dict
  sudo make install_rime_dict

License:

Unlicense

Note that the generated dictionary follows Wikimedia's license: https://dumps.wikimedia.org/legal.html
fcitx5-pinyin-zhwiki-0.3.0/convert.py000077500000000000000000000046661507717544600176140ustar00rootroot00000000000000
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Usage:
#   convert.py input_filename
# input_filename is a file of Wikipedia
article titles, one title per line. import logging import regex import sys import opencc from pypinyin import lazy_pinyin # Require at least 2 characters _MINIMUM_LEN = 2 _LIST_PAGE_ENDINGS = [ '列表', '对照表', ] _LOG_EVERY = 1000 _PINYIN_SEPARATOR = '\'' # https://ayaka.shn.hk/hanregex/ # INTERPUNCT \u00b7 -> · # HYPHEN-MINUS \u002d -> - # HYPHEN \u2010 -> ‐ # EN DASH \u2013 -> – # EM DASH \u2014 -> — _HANZI_RE = regex.compile(r"([\p{Unified_Ideograph}\u3006\u3007\u00b7\u002d\u2010\u2013\u2014][\ufe00-\ufe0f\U000e0100-\U000e01ef]?)+") _INTERPUNCT_TRANSTAB = str.maketrans("", "", "·-‐–—") _TO_SIMPLIFIED_CHINESE = opencc.OpenCC('t2s.json') _PINYIN_FIXES = { 'n': 'en', # https://github.com/felixonmars/fcitx5-pinyin-zhwiki/issues/13 } logging.basicConfig(level=logging.INFO) def is_good_title(title, previous_title=None): if not _HANZI_RE.fullmatch(title): return False # Skip single character & too long pages if len(title) < _MINIMUM_LEN: return False # Skip list pages if title.endswith(tuple(_LIST_PAGE_ENDINGS)): return False if previous_title and \ len(previous_title) >= 4 and \ title.startswith(previous_title): return False return True def log_count(count): logging.info(f'{count} words generated') def make_output(word, pinyin): return '\t'.join([word, pinyin, '0']) def main(): previous_title = None result_count = 0 with open(sys.argv[1]) as f: for line in f: title = _TO_SIMPLIFIED_CHINESE.convert(line.strip()) if is_good_title(title, previous_title): stripped_title = title.translate(_INTERPUNCT_TRANSTAB) pinyin = [_PINYIN_FIXES.get(item, item) for item in lazy_pinyin(stripped_title)] pinyin = _PINYIN_SEPARATOR.join(pinyin) if _HANZI_RE.search(pinyin): logging.info( f'Failed to convert to Pinyin. 
Ignoring: {pinyin}') continue print(make_output(title, pinyin)) result_count += 1 if result_count % _LOG_EVERY == 0: log_count(result_count) previous_title = title log_count(result_count) if __name__ == '__main__': main() fcitx5-pinyin-zhwiki-0.3.0/mediawiki.py000066400000000000000000000034431507717544600200640ustar00rootroot00000000000000import requests import time from datetime import datetime from email.utils import parsedate_to_datetime from tenacity import retry, retry_if_result, stop_after_attempt def is_none_p(value): """Return True if value is None""" return value is None def get_retry_after_delay(response: requests.Response) -> int: """ Parses the Retry-After header from an HTTP response and returns the delay in seconds. """ retry_after = response.headers.get("Retry-After") if not retry_after: return 0 try: # Attempt to parse as an integer (delay in seconds) delay_seconds = int(retry_after) return delay_seconds except ValueError: # If not an integer, attempt to parse as an HTTP date try: retry_date = parsedate_to_datetime(retry_after) now = datetime.now(retry_date.tzinfo) delay_seconds = (retry_date - now).total_seconds() return max(0, int(delay_seconds)) except (ValueError, TypeError): raise def init_session(): # https://wikitech.wikimedia.org/wiki/Robot_policy global session session = requests.Session() headers = { "User-Agent": f"User-Agent: FcitxZhwikiDictBot/1.0 (https://github.com/felixonmars/fcitx5-pinyin-zhwiki) python-requests/{requests.__version__}", "Accept-Encoding": "gzip, deflate, br, zstd", } session.headers.update(headers) @retry(stop=stop_after_attempt(10), retry=retry_if_result(is_none_p)) def do_request(url: str, params=None) -> requests.Response: r = session.get(url, params=params) if r.status_code == 200: return r elif r.status_code == 429: delay = get_retry_after_delay(r) time.sleep(delay) return None else: r.raise_for_status() return None 
fcitx5-pinyin-zhwiki-0.3.0/recipe.yaml000066400000000000000000000004461507717544600177020ustar00rootroot00000000000000# encoding: utf-8 --- recipe: Rx: zhwiki-dict description: >- Install zhwiki dict from fcitx5-pinyin-zhwiki download_files: >- zhwiki.dict.yaml::https://github.com/felixonmars/fcitx5-pinyin-zhwiki/releases/download/0.2.5/zhwiki-20250823.dict.yaml install_files: >- zhwiki.dict.yaml fcitx5-pinyin-zhwiki-0.3.0/zhwiki-web-slang.py000077500000000000000000000070721507717544600213100ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- import urllib.parse import collections import sys from mediawiki import init_session, do_request def fetch(): # https://www.mediawiki.org/wiki/API:REST_API/Reference#Get_page_source _ZHWIKI_SOURCE_URL = "https://zh.wikipedia.org/w/rest.php/v1/page/" _PAGE = "中国大陆网络用语列表" url = _ZHWIKI_SOURCE_URL + urllib.parse.quote(_PAGE) init_session() r = do_request(url) page = r.json() return page["source"] def trim_templates(wikitext): template_level = 0 new_wikitext = "" while True: assert template_level >= 0, ValueError("Unbalanced template in wikitext:\n" + wikitext) pre_open, open_tag, post_open = wikitext.partition("{{") pre_close, close_tag, post_close = wikitext.partition("}}") if open_tag and (not close_tag or len(pre_open) < len(pre_close)): # Template starts here ({{) wikitext = post_open if template_level == 0: new_wikitext += pre_open template_level += 1 elif close_tag: # Template ends here (}}) wikitext = post_close template_level -= 1 else: # No more templates assert template_level == 0, ValueError("Unbalanced template in wikitext:\n" + wikitext) # The assertion below must be true on earth assert open_tag == close_tag == "", RuntimeError("Cosmic radiation detected") new_wikitext += wikitext break return new_wikitext def process(wikitext): wikitext = trim_templates(wikitext) words = collections.OrderedDict() def add_word(word): for garbage in ("[", "]", "…", ":", ":", ")", ")", '"', "“", "”", "-{", 
"}-", "简称", "簡稱"): word = word.replace(garbage, "") words[word.strip()] = None def add_words(word): for word_separator in ("、", "/", "|", ",", "。", "?", "?", "(", "("): if word_separator in word: for w in word.split(word_separator): # recursively resolve add_words(w.strip()) break else: add_word(word) def iter_bolds(line): line_bak = line while "'''" in line: _, sep1, line = line.partition("'''") bold, sep2, line = line.partition("'''") assert sep1 and sep2, ValueError("Unclosed ''' in line: " + line_bak) yield bold for line in wikitext.split("\n"): if not line.startswith("*"): continue # Lists line = line.strip("*").strip() pre_colon, sep, post_colon = line.partition("''':") if not sep: pre_colon, sep, post_colon = line.partition("''':") for bold in iter_bolds(pre_colon + sep): # Add bold words before colon add_words(bold) for bold in iter_bolds(post_colon): # Add bold words after colon (or line w/o colon), skipping the origin of abbreviation (length probably <= 2) if len(bold) > 2: add_words(bold) return words def print_words(words): for word in words: print(word) if __name__ == "__main__": if len(sys.argv) == 1: wikitext = fetch() words = process(wikitext) print_words(words) elif sys.argv[1] == "--fetch": print(fetch()) elif sys.argv[1] == "--process": wikitext = open(sys.argv[2]).read() print_words(process(wikitext)) else: raise NotImplementedError