Explorar o código

Add check_word_in_academic()

Sergienko Anton hai 7 anos
pai
achega
5201498dc0
Modificouse 1 ficheiro con 36 adicións e 97 borrados
  1. 36 97
      src/program_efremova.py

+ 36 - 97
src/program_efremova.py

@@ -210,6 +210,28 @@ def check_word_in_wiktionary(word):
     return answer
 
 
+def check_word_in_academic(word):
+    answer = 'null'
+    try:
+        response = requests.get('https://dic.academic.ru/searchall.php?SWord=' + word)
+        if response.status_code == 200:
+            html = response.text
+
+            if re.search(
+                    re.escape(
+                        word) + r'</a><\/strong> — сущ\.(.*?)<\/p>\n<p class="src"><a href="\/\/dic\.academic\.ru\/contents.nsf\/dic_synonims\/">Словарь синонимов<\/a><\/p>',
+                    html, re.S):
+                answer = 'noun'
+        else:
+            answer = response.status_code
+    except ConnectionError:
+        print("Ошибка: ConnectionError")
+        time.sleep(1)
+    print('answer = {}'.format(answer))
+    print('-------------------------')
+    return answer
+
+
 @function_execution_time
 @if_exist_json
 def check_words_on_sites():
@@ -217,106 +239,23 @@ def check_words_on_sites():
 
     i = 0
     for word, entry in dictionary.items():
-        if (
-                entry['is_noun_by_dictionary'] and
-                entry['is_probably_not_noun'] and
-                entry['answer_from_sites'] == 'null'
-        ):
-            try:
-                response = requests.get('https://ru.wiktionary.org/wiki/' + word)
-                print('{} status_code = {}'.format(word, response.status_code))
-                if response.status_code == 200:
-                    html = response.text
-                    is_noun_by_wiktionary = False
-                    is_not_noun_by_wiktionary = False
-                    if (
-                            'title="существительное">Существительное</a>' in html or
-                            'Существительное.' in html or
-                            'title="выступает в роли существительного">субстантивир.</span>' in html or
-                            ('Существительное' in html and 'Прилагательное' not in html) or
-                            'Существительное, одушевлённое,  тип склонения по ' in html
-                    ):
-                        is_noun_by_wiktionary = True
-
-                    if (
-                            'title="прилагательное">Прилагательное</a>' in html or
-                            'title="причастие">Причастие</a>' in html or
-                            'title="причастие">причастие</a>' in html or
-                            'title="наречие">Наречие</a>' in html or
-                            'title="деепричастие">деепричастие</a>' in html or
-                            ('Существительное' not in html and 'Прилагательное' in html) or
-                            ('Существительное' not in html and 'прилагательного' in html) or
-                            ('Существительное' not in html and 'Местоименное прилагательное' in html) or
-                            ('Существительное' not in html and 'Притяжательное местоимение' in html) or
-                            ('Существительное' not in html and 'Притяжательное прилагательное' in html) or
-                            ('Существительное' not in html and 'Числительное' in html) or
-                            ('Существительное' not in html and 'Порядковое числительное' in html) or
-                            ('Существительное' not in html and 'Местоимение' in html) or
-                            ('Существительное' not in html and 'Указательное местоимение' in html)
-                    ):
-                        is_not_noun_by_wiktionary = True
-
-                    if is_noun_by_wiktionary:
-                        dictionary[word]['answer_from_sites'] = 'noun'
-                        print('answer_from_sites = noun')
-
-                    if is_not_noun_by_wiktionary:
-                        dictionary[word]['answer_from_sites'] = 'not_noun'
-                        print('answer_from_sites = not_noun')
-
-                    if not is_noun_by_wiktionary and not is_not_noun_by_wiktionary:
-                        print('Need more checks')
-                else:
-                    dictionary[word]['answer_from_sites'] = response.status_code
-                    print('url = {}'.format(response.url))
-                print('-------------------------')
-
-                i += 1
-                if i % 100 == 0:
-                    save_json(dictionary)
-            except ConnectionError:
-                print("Error: ConnectionError")
-                time.sleep(1)
+        if 'answerIsProbablyNotNoun' in entry and entry['answerIsProbablyNotNoun'] == 'null':
+            answer = check_word_in_academic(word)
+            if answer != 'null':
+                dictionary[word]['answerIsProbablyNotNoun'] = answer
+        i += 1
+        if i % 100 == 0:
+            save_json(dictionary)
 
     i = 0
     for word, entry in dictionary.items():
-        if (
-                entry['is_noun_by_dictionary'] and
-                entry['is_probably_not_noun'] and
-                entry['answer_from_sites'] == 404
-        ):
-            try:
-                response = requests.get('https://dic.academic.ru/searchall.php?SWord=' + word)
-                print('{} status_code = {}'.format(word, response.status_code))
-                if response.status_code == 200:
-                    html = response.text
-                    is_noun_by_wiktionary = False
-                    is_not_noun_by_wiktionary = False
-
-                    if re.search(
-                            re.escape(
-                                word) + r'</a><\/strong> — сущ\.(.*?)<\/p>\n<p class="src"><a href="\/\/dic\.academic\.ru\/contents.nsf\/dic_synonims\/">Словарь синонимов<\/a><\/p>',
-                            html, re.S):
-                        is_noun_by_wiktionary = True
-
-                    if is_noun_by_wiktionary:
-                        dictionary[word]['answer_from_sites'] = 'noun'
-                        print('answer_from_sites = noun')
-
-                    # if is_not_noun_by_wiktionary:
-                    #     dictionary[word]['answer_from_sites'] = 'not_noun'
-                    #     print('answer_from_sites = not_noun')
-
-                    if not is_noun_by_wiktionary:
-                        print('Need more checks')
-                print('-------------------------')
-
-                i += 1
-                if i % 100 == 0:
-                    save_json(dictionary)
-            except ConnectionError:
-                print("Ошибка: ConnectionError")
-                time.sleep(1)
+        if 'answerIsProbablyNotNoun' in entry and entry['answerIsProbablyNotNoun'] == 'null':
+            answer = check_word_in_academic(word)
+            if answer != 'null':
+                dictionary[word]['answerIsProbablyNotNoun'] = answer
+        i += 1
+        if i % 100 == 0:
+            save_json(dictionary)
 
     save_json(dictionary)
     print('Проверка подозрительных слов завершена')