from pathlib import Path
import functools
import json
import os
import re
import time

import requests

# Plain-text source dictionary and the JSON file generated from it.
dictionary_filename = 'efremova.txt'
json_filename = 'data.json'


def if_exist_dictionary(func):
    """Run *func* only when the source dictionary file exists.

    When the file is missing, a message is printed and the wrapped call
    returns ``None`` without invoking *func*. Otherwise all arguments are
    forwarded and the result of *func* is returned.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if not Path(dictionary_filename).is_file():
            print('Файл {} не существует. Программа не может быть выполнена'.format(dictionary_filename))
            return None
        return func(*args, **kwargs)
    return wrapper


def if_exist_json(func):
    """Run *func* only when the generated JSON file exists.

    When the file is missing, a message is printed and the wrapped call
    returns ``None`` without invoking *func*. Otherwise all arguments are
    forwarded and the result of *func* is returned.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if not Path(json_filename).is_file():
            print('Файл {} не существует. Его нужно сгенерировать первоначально'.format(json_filename))
            return None
        return func(*args, **kwargs)
    return wrapper


def function_execution_time(func):
    """Print the wall-clock duration of every call to *func*.

    Arguments are forwarded to *func* and its result is returned, so the
    decorator can wrap functions that take parameters. The duration is
    printed as HH:MM:SS and only when *func* returns normally.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        print('Время выполнения функции: {}\n'.format(time.strftime('%H:%M:%S', time.gmtime(end - start))))
        return result
    return wrapper


def save_json(dictionary):
    """Write *dictionary* to the JSON file and report created vs. updated."""
    # Decide on the wording before the file is (re)created by open().
    action = 'обновлен' if Path(json_filename).is_file() else 'создан'
    with open(json_filename, 'w', encoding='utf8') as outfile:
        json.dump(dictionary, outfile, ensure_ascii=False, indent=4)
    print('Файл {} {}'.format(json_filename, action))


def read_json():
    """Load and return the content of the JSON file."""
    with open(Path(json_filename), encoding='utf8') as f:
        dictionary = json.load(f)
    print('Файл {} открыт'.format(json_filename))
    return dictionary


@function_execution_time
def remove_all_temporary_files():
    """Delete the generated JSON file, reporting what was done."""
    def remove(filename):
        # Operate on the argument rather than on the module-level constant.
        if Path(filename).is_file():
            os.remove(filename)
            print('Файл {} удален'.format(filename))
        else:
            print('Файл {} не существует'.format(filename))

    remove(json_filename)
    print('Временных файлов больше нет')


@function_execution_time
@if_exist_dictionary
def generated_json():
    """Build the JSON word list from the plain-text dictionary.

    Every line of the dictionary file is split on its first space into a
    headword and a definition. Only entries whose definition starts with
    one of the markers ``ж.``, ``м.``, ``ср.``, ``мн.`` (optionally preceded
    by ``1.`` or ``I``) are kept; each kept word is flagged when it ends in
    one of a fixed list of endings, and the result is saved as JSON.
    """
    with open(Path(dictionary_filename), encoding='utf8') as f:
        lines = f.read().splitlines()
    dictionary = {}
    for line in lines:
        split_line = line.split(' ', 1)
        if len(split_line) < 2:
            # Blank line or a headword without a definition: nothing to parse.
            continue
        word, definition = split_line
        if not (
            re.match(r'(ж|м|ср|мн)\.(.*)$', definition)
            or re.match(r'(1\.|I) (ж|м|ср|мн)\.(.*)$', definition)
        ):
            continue
        is_probably_not_noun = False
endings = ['ая', 'ее', 'ие', 'ий', 'ое', 'ой', 'ые', 'ый', 'ье', 'ьи', 'ья', 'яя'] for ending in endings: is_probably_not_noun = is_probably_not_noun or word.endswith(ending) dictionary[word] = {'definition': definition, 'is_probably_not_noun': is_probably_not_noun, 'answer_from_sites': 'null'} save_json(dictionary) @function_execution_time @if_exist_json def how_many_articles_need_to_check(): dictionary = read_json() count_all = 0 count_nouns_by_dictionary = 0 count_check = 0 for word, entry in dictionary.items(): count_all += 1 if entry['is_noun_by_dictionary']: count_nouns_by_dictionary += 1 if ( entry['is_noun_by_dictionary'] and entry['is_probably_not_noun'] and entry['answer_from_sites'] == 'null' ): count_check += 1 print('Все слова: {}'.format(count_all)) print('Количество существительных по Ефремовой: {}'.format(count_nouns_by_dictionary)) print('Нужно проверить на сайтах: {}'.format(count_check)) @function_execution_time @if_exist_json def print_list_of_words(answer_from_sites): dictionary = read_json() count = 0 for word, entry in dictionary.items(): if entry['is_noun_by_dictionary'] and entry['is_probably_not_noun']: is_print = False if answer_from_sites == 'null' and entry['answer_from_sites'] == 'null': is_print = True if answer_from_sites == '404' and entry['answer_from_sites'] == 404: is_print = True if is_print: print(word) print('answer_from_sites = {}'.format(entry['answer_from_sites'])) print('-------------------------') count += 1 print('Слов: {}'.format(count)) @function_execution_time @if_exist_json def check_words_on_sites(): dictionary = read_json() i = 0 for word, entry in dictionary.items(): if ( entry['is_noun_by_dictionary'] and entry['is_probably_not_noun'] and entry['answer_from_sites'] == 'null' ): try: response = requests.get('https://ru.wiktionary.org/wiki/' + word) print('{} status_code = {}'.format(word, response.status_code)) if response.status_code == 200: html = response.text is_noun_by_wiktionary = False 
is_not_noun_by_wiktionary = False if ( 'title="существительное">Существительное' in html or 'Существительное.' in html or 'title="выступает в роли существительного">субстантивир.' in html or ('Существительное' in html and 'Прилагательное' not in html) or 'Существительное, одушевлённое, тип склонения по ' in html ): is_noun_by_wiktionary = True if ( 'title="прилагательное">Прилагательное' in html or 'title="причастие">Причастие' in html or 'title="причастие">причастие' in html or 'title="наречие">Наречие' in html or 'title="деепричастие">деепричастие' in html or ('Существительное' not in html and 'Прилагательное' in html) or ('Существительное' not in html and 'прилагательного' in html) or ('Существительное' not in html and 'Местоименное прилагательное' in html) or ('Существительное' not in html and 'Притяжательное местоимение' in html) or ('Существительное' not in html and 'Притяжательное прилагательное' in html) or ('Существительное' not in html and 'Числительное' in html) or ('Существительное' not in html and 'Порядковое числительное' in html) or ('Существительное' not in html and 'Местоимение' in html) or ('Существительное' not in html and 'Указательное местоимение' in html) ): is_not_noun_by_wiktionary = True if is_noun_by_wiktionary: dictionary[word]['answer_from_sites'] = 'noun' print('answer_from_sites = noun') if is_not_noun_by_wiktionary: dictionary[word]['answer_from_sites'] = 'not_noun' print('answer_from_sites = not_noun') if not is_noun_by_wiktionary and not is_not_noun_by_wiktionary: print('Need more checks') else: dictionary[word]['answer_from_sites'] = response.status_code print('url = {}'.format(response.url)) print('-------------------------') i += 1 if i % 100 == 0: save_json(dictionary) except ConnectionError: print("Error: ConnectionError") time.sleep(1) i = 0 for word, entry in dictionary.items(): if ( entry['is_noun_by_dictionary'] and entry['is_probably_not_noun'] and entry['answer_from_sites'] == 404 ): try: response = 
requests.get('https://dic.academic.ru/searchall.php?SWord=' + word) print('{} status_code = {}'.format(word, response.status_code)) if response.status_code == 200: html = response.text is_noun_by_wiktionary = False is_not_noun_by_wiktionary = False if re.search( re.escape( word) + r'<\/strong> — сущ\.(.*?)<\/p>\n