program_efremova.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. from pathlib import Path
  2. import json
  3. import re
  4. import requests
  5. import time
  6. import os
  7. dictionary_filename = 'efremova.txt'
  8. dictionary_json_filename = 'data.json'
  9. url = 'https://ru.wiktionary.org/wiki/'
  10. def main():
  11. if not is_exist_dictionary():
  12. return
  13. while True:
  14. print('')
  15. print('1 - Clear all temporary files')
  16. print('2 - Generated file {}'.format(dictionary_json_filename))
  17. print('3 - How many articles need to check on {}'.format(url))
  18. print('4 - Check the words on {}'.format(dictionary_json_filename))
  19. print('5 - Print a list of unchecked words on {}'.format(dictionary_json_filename))
  20. print('6 - Print a list of words on {} with 404 error'.format(dictionary_json_filename))
  21. print('10 - Exit')
  22. command = int(input('Enter command number '))
  23. if command == 1:
  24. clear_all_temporary_files()
  25. elif command == 2:
  26. generated_json()
  27. elif command == 3:
  28. how_many_articles_need_to_check()
  29. elif command == 4:
  30. check_words_on_site()
  31. elif command == 5:
  32. print_list_of_words('null')
  33. elif command == 6:
  34. print_list_of_words('404')
  35. elif command == 10:
  36. break
  37. def clear_all_temporary_files():
  38. start = time.time()
  39. file = Path(dictionary_json_filename)
  40. if file.is_file():
  41. os.remove(dictionary_json_filename)
  42. print('All temporary files deleted')
  43. end = time.time()
  44. print_time(start, end)
  45. def generated_json():
  46. start = time.time()
  47. if not is_exist_dictionary():
  48. return
  49. file = Path(dictionary_filename)
  50. with open(file, encoding='utf8') as f:
  51. lines = f.read().splitlines()
  52. dictionary = dict()
  53. for line in lines:
  54. split_line = line.split(' ', 1)
  55. word = split_line[0]
  56. definition = split_line[1]
  57. is_noun_by_dictionary = False
  58. if re.match(r'(ж|м|ср|мн)\.(.*)$', definition) or re.match(r'(1\.|I) (ж|м|ср|мн)\.(.*)$', definition):
  59. is_noun_by_dictionary = True
  60. is_possible_not_noun = False
  61. if (
  62. word.endswith('ая') or
  63. word.endswith('ее') or
  64. word.endswith('ие') or
  65. word.endswith('ий') or
  66. word.endswith('ое') or
  67. word.endswith('ой') or
  68. word.endswith('ые') or
  69. word.endswith('ый') or
  70. word.endswith('ье') or
  71. word.endswith('ьи') or
  72. word.endswith('ья') or
  73. word.endswith('яя')
  74. ):
  75. is_possible_not_noun = True
  76. entry = dict()
  77. entry['definition'] = definition
  78. entry['is_noun_by_dictionary'] = is_noun_by_dictionary
  79. entry['is_possible_not_noun'] = is_possible_not_noun
  80. entry['answer_from_wiktionary'] = 'null'
  81. dictionary[word] = entry
  82. save_json(dictionary)
  83. end = time.time()
  84. print_time(start, end)
  85. def how_many_articles_need_to_check():
  86. start = time.time()
  87. if not is_exist_json():
  88. return
  89. dictionary = read_json()
  90. count_all = 0
  91. count_nouns_by_dictionary = 0
  92. count_check = 0
  93. for word, entry in dictionary.items():
  94. count_all += 1
  95. if entry['is_noun_by_dictionary']:
  96. count_nouns_by_dictionary += 1
  97. if (
  98. entry['is_noun_by_dictionary'] and
  99. entry['is_possible_not_noun'] and
  100. entry['answer_from_wiktionary'] == 'null'
  101. ):
  102. count_check += 1
  103. print('All words: {}'.format(count_all))
  104. print('All nouns by dictionary: {}'.format(count_nouns_by_dictionary))
  105. print('It remains to check words: {}'.format(count_check))
  106. end = time.time()
  107. print_time(start, end)
  108. def print_list_of_words(answer_from_wiktionary):
  109. start = time.time()
  110. if not is_exist_json():
  111. return
  112. dictionary = read_json()
  113. count = 0
  114. for word, entry in dictionary.items():
  115. if entry['is_noun_by_dictionary'] and entry['is_possible_not_noun']:
  116. is_print = False
  117. if answer_from_wiktionary == 'null' and entry['answer_from_wiktionary'] == 'null':
  118. is_print = True
  119. if answer_from_wiktionary == '404' and entry['answer_from_wiktionary'] == 404:
  120. is_print = True
  121. if is_print:
  122. print(word)
  123. print('answer_from_wiktionary = {}'.format(entry['answer_from_wiktionary']))
  124. print('-------------------------')
  125. count += 1
  126. print('Words: {}'.format(count))
  127. end = time.time()
  128. print_time(start, end)
  129. def check_words_on_site():
  130. start = time.time()
  131. if not is_exist_json():
  132. return
  133. dictionary = read_json()
  134. i = 0
  135. for word, entry in dictionary.items():
  136. if (
  137. entry['is_noun_by_dictionary'] and
  138. entry['is_possible_not_noun'] and
  139. entry['answer_from_wiktionary'] == 'null'
  140. ):
  141. try:
  142. response = requests.get(url + word)
  143. print('{} status_code = {}'.format(word, response.status_code))
  144. if response.status_code == 200:
  145. html = response.text
  146. is_noun_by_wiktionary = False
  147. is_not_noun_by_wiktionary = False
  148. if (
  149. 'title="существительное">Существительное</a>' in html or
  150. 'Существительное.' in html or
  151. 'title="выступает в роли существительного">субстантивир.</span>' in html or
  152. ('Существительное' in html and 'Прилагательное' not in html) or
  153. 'Существительное, одушевлённое, тип склонения по ' in html
  154. ):
  155. is_noun_by_wiktionary = True
  156. if (
  157. 'title="прилагательное">Прилагательное</a>' in html or
  158. 'title="причастие">Причастие</a>' in html or
  159. 'title="причастие">причастие</a>' in html or
  160. 'title="наречие">Наречие</a>' in html or
  161. 'title="деепричастие">деепричастие</a>' in html or
  162. ('Существительное' not in html and 'Прилагательное' in html) or
  163. ('Существительное' not in html and 'прилагательного' in html) or
  164. ('Существительное' not in html and 'Местоименное прилагательное' in html) or
  165. ('Существительное' not in html and 'Притяжательное местоимение' in html) or
  166. ('Существительное' not in html and 'Притяжательное прилагательное' in html) or
  167. ('Существительное' not in html and 'Числительное' in html) or
  168. ('Существительное' not in html and 'Порядковое числительное' in html) or
  169. ('Существительное' not in html and 'Местоимение' in html) or
  170. ('Существительное' not in html and 'Указательное местоимение' in html)
  171. ):
  172. is_not_noun_by_wiktionary = True
  173. if is_noun_by_wiktionary:
  174. dictionary[word]['answer_from_wiktionary'] = 'noun'
  175. print('answer_from_wiktionary = noun')
  176. if is_not_noun_by_wiktionary:
  177. dictionary[word]['answer_from_wiktionary'] = 'not_noun'
  178. print('answer_from_wiktionary = not_noun')
  179. if not is_noun_by_wiktionary and not is_not_noun_by_wiktionary:
  180. print('Need more checks')
  181. else:
  182. dictionary[word]['answer_from_wiktionary'] = response.status_code
  183. print('url = {}'.format(response.url))
  184. print('-------------------------')
  185. i += 1
  186. if i % 100 == 0:
  187. save_json(dictionary)
  188. except ConnectionError:
  189. print("Error: ConnectionError")
  190. time.sleep(1)
  191. save_json(dictionary)
  192. print('Analysis of a dictionary using the {} ended'.format(url))
  193. end = time.time()
  194. print_time(start, end)
  195. def print_time(start, end):
  196. print('Function execution time: {}'.format(end - start))
  197. def save_json(dictionary):
  198. file = Path(dictionary_json_filename)
  199. action = 'updated' if file.is_file() else 'created'
  200. with open(dictionary_json_filename, 'w', encoding='utf8') as outfile:
  201. json.dump(dictionary, outfile, ensure_ascii=False, indent=4)
  202. print('File {} {}'.format(dictionary_json_filename, action))
  203. def read_json():
  204. file = Path(dictionary_json_filename)
  205. with open(file, encoding='utf8') as f:
  206. dictionary = json.loads(f.read())
  207. print('File ' + dictionary_json_filename + ' opened')
  208. return dictionary
  209. def is_exist_json():
  210. file = Path(dictionary_json_filename)
  211. if not file.is_file():
  212. print('File {} not exists. This file needs to be generated.'.format(dictionary_json_filename))
  213. return file.is_file()
  214. def is_exist_dictionary():
  215. file = Path(dictionary_filename)
  216. if not file.is_file():
  217. print('File {} not exists. The program cannot work.'.format(dictionary_filename))
  218. return file.is_file()
  219. def test():
  220. pass
  221. if __name__ == '__main__':
  222. main()