This example requires two running Docker containers: ``tchewik/isanlp_udpipe`` (syntax) and ``tchewik/isanlp_rst:2.0`` (RST).
Usage in Python:
python
from isanlp import PipelineCommon
from isanlp.processor_razdel import ProcessorRazdel
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.processor_mystem import ProcessorMystem
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd
import razdel
# Put the addresses of the running containers here ->
# (host, port) of the running ``tchewik/isanlp_udpipe`` container (syntax).
# The empty host string is a placeholder: replace it with the real address.
address_syntax = ('', 3134)
# (host, port) of the running ``tchewik/isanlp_rst:2.0`` container (RST).
# The empty host string is a placeholder: replace it with the real address.
address_rst = ('', 3335)
# It is highly recommended to pre-tokenize texts:
def tokenize(text):
    """Tokenize ``text`` with razdel, keeping one output line per paragraph.

    Runs of consecutive newlines are first collapsed into a single newline.
    Each resulting line is then tokenized with ``razdel.tokenize`` and its
    tokens are re-joined with single spaces.

    Args:
        text: Raw input string; paragraphs are separated by newlines.

    Returns:
        The space-separated tokens of every paragraph, with paragraphs joined
        by ``'\\n'`` and leading/trailing whitespace stripped.
    """
    # Repeat the replacement until no blank line is left: a single pass turns
    # '\n\n\n' into '\n\n', so longer runs need more than one iteration.
    while '\n\n' in text:
        text = text.replace('\n\n', '\n')

    tokenized_paragraphs = (
        ' '.join(token.text for token in razdel.tokenize(paragraph))
        for paragraph in text.split('\n')
    )
    return '\n'.join(tokenized_paragraphs).strip()
# Analysis pipeline. Every entry is a triple of
# (processor, names of the input annotations, output-name mapping);
# later entries consume annotations produced by earlier ones.
ppl = PipelineCommon([
    # Stage 1: split the raw text into tokens and sentences.
    (
        ProcessorRazdel(),
        ['text'],
        {'tokens': 'tokens', 'sentences': 'sentences'},
    ),
    # Stage 2: remote syntax service at ``address_syntax``. Its 'postag'
    # entry is mapped to 'ud_postag', so it does not share a name with the
    # 'postag' entries of the stages below.
    # NOTE(review): the third argument ('0') presumably names the pipeline
    # on the remote server -- confirm against the isanlp documentation.
    (
        ProcessorRemote(address_syntax[0], address_syntax[1], '0'),
        ['tokens', 'sentences'],
        {
            'lemma': 'lemma',
            'syntax_dep_tree': 'syntax_dep_tree',
            'postag': 'ud_postag',
        },
    ),
    # Stage 3: Mystem processor, created with ``delay_init=False``.
    (
        ProcessorMystem(delay_init=False),
        ['tokens', 'sentences'],
        {'postag': 'postag'},
    ),
    # Stage 4: converter from Mystem tags to UD; takes 'postag' and yields
    # 'morph' plus 'postag'.
    (
        ConverterMystemToUd(),
        ['postag'],
        {'morph': 'morph', 'postag': 'postag'},
    ),
    # Stage 5: remote RST service at ``address_rst``; receives every
    # annotation listed below and yields 'rst'.
    (
        ProcessorRemote(address_rst[0], address_rst[1], 'default'),
        ['text', 'tokens', 'sentences', 'postag', 'morph', 'lemma',
         'syntax_dep_tree'],
        {'rst': 'rst'},
    ),
])