feat(tags): replace regexp-based parser with a parser combinator

This commit is contained in:
Paul Chavard 2022-10-04 18:53:28 +02:00
parent 32cdff820f
commit 432c4690b5
5 changed files with 137 additions and 52 deletions

View file

@ -59,6 +59,7 @@ gem 'net-imap', require: false # See https://github.com/mikel/mail/pull/1439
gem 'net-pop', require: false # same
gem 'net-smtp', require: false # same
gem 'openid_connect'
gem 'parsby'
gem 'pg'
gem 'phonelib'
gem 'prawn-rails' # PDF Generation

View file

@ -467,6 +467,7 @@ GEM
webfinger (>= 1.0.1)
orm_adapter (0.5.0)
parallel (1.22.1)
parsby (1.1.1)
parser (3.1.2.0)
ast (~> 2.4.1)
pdf-core (0.9.0)
@ -877,6 +878,7 @@ DEPENDENCIES
net-pop
net-smtp
openid_connect
parsby
pg
phonelib
prawn-rails

View file

@ -4,6 +4,45 @@ module TagsSubstitutionConcern
include Rails.application.routes.url_helpers
include ActionView::Helpers::UrlHelper
# Parser-combinator grammar (built on Parsby) that splits a template string
# into an ordered list of tokens: `{ text: String }` for literal text and
# `{ tag: String }` for a `--tag--` placeholder.
module TagsParser
include Parsby::Combinators
# Makes the combinators callable directly on the module (TagsParser.parse).
extend self
# Entry point: runs the `doc` grammar over `io` and returns its result.
# NOTE(review): behavior on unparseable input is decided by Parsby and is
# not visible here.
def parse(io)
doc.parse io
end
# A document is any sequence of tags and text runs, followed by end of input.
# NOTE(review): `<` is presumably Parsby's "sequence, keep the left result"
# operator, so the token list is returned and `eof` is only checked — confirm.
define_combinator :doc do
many(tag | text) < eof
end
# A run of literal text, wrapped as `{ text: ... }`.
# NOTE(review): `any_char.that_fail(tag)` presumably consumes a character only
# when `tag` does not match at the current position, so a text run stops
# right before the next well-formed tag — confirm against Parsby docs.
define_combinator :text do
join(many(any_char.that_fail(tag))).fmap do |str|
# Relabel the bytes as UTF-8; `encode` without arguments then transcodes to
# Encoding.default_internal (a no-op when that is unset).
{ text: str.force_encoding('utf-8').encode }
end
end
# A tag is `tag_text` enclosed between two `--` delimiters, wrapped as
# `{ tag: ... }` (delimiters are not part of the value).
define_combinator :tag do
between(tag_delimiter, tag_delimiter, tag_text).fmap do |tag|
{ tag: tag }
end
end
# The literal `--` that opens and closes a tag.
define_combinator :tag_delimiter do
lit('--')
end
# The tag's inner text: characters up to the next delimiter or end of line,
# which means a tag cannot span several lines (assuming the `that_fail`
# semantics noted on `text`).
define_combinator :tag_text do
join(many(any_char.that_fail(tag_delimiter | eol))).fmap do |str|
str.force_encoding('utf-8').encode
end
end
# End of line: CRLF is tried before a bare LF.
define_combinator :eol do
lit("\r\n") | lit("\n")
end
end
DOSSIER_TAGS = [
{
libelle: 'motivation',
@ -141,8 +180,6 @@ module TagsSubstitutionConcern
SHARED_TAG_LIBELLES = (DOSSIER_TAGS + DOSSIER_TAGS_FOR_MAIL + INDIVIDUAL_TAGS + ENTREPRISE_TAGS + ROUTAGE_TAGS).map { |tag| tag[:libelle] }
TAG_DELIMITERS_REGEX = /--(?<capture>((?!--).)*)--/
def tags
if procedure.for_individual?
identity_tags = INDIVIDUAL_TAGS
@ -159,7 +196,7 @@ module TagsSubstitutionConcern
end
def used_type_de_champ_tags(text)
used_tags_for(text, with_libelle: true).filter_map do |(tag, libelle)|
used_tags_and_libelle_for(text).filter_map do |(tag, libelle)|
if !tag.in?(SHARED_TAG_LIBELLES)
if tag.start_with?('tdc')
[libelle, tag.gsub('tdc', '').to_i]
@ -170,19 +207,8 @@ module TagsSubstitutionConcern
end
end
def used_tags_for(text, with_libelle: false)
text, tags = normalize_tags(text)
text
.scan(TAG_DELIMITERS_REGEX)
.flatten
.map do |tag_str|
if with_libelle
tag = tags.find { |tag| tag[:id] == tag_str }
[tag_str, tag ? tag[:libelle] : nil]
else
tag_str
end
end
# Returns the first element of every entry produced by
# used_tags_and_libelle_for(text): the tag id when the tag was resolved,
# otherwise the raw tag text.
def used_tags_for(text)
  used_tags_and_libelle_for(text).map(&:first)
end
private
@ -250,7 +276,7 @@ module TagsSubstitutionConcern
return ''
end
text, _ = normalize_tags(text)
tokens = parse_tags(text)
tags_and_datas = [
[champ_public_tags(dossier: dossier), dossier.champs],
@ -259,51 +285,68 @@ module TagsSubstitutionConcern
[ROUTAGE_TAGS, dossier],
[INDIVIDUAL_TAGS, dossier.individual],
[ENTREPRISE_TAGS, dossier.etablissement&.entreprise]
]
tags_and_datas
.map { |(tags, data)| [filter_tags(tags), data] }
.reduce(text) { |acc, (tags, data)| replace_tags_with_values_from_data(acc, tags, data) }
].filter_map do |(tags, data)|
data && [filter_tags(tags).index_by { _1[:id].presence || _1[:libelle] }, data]
end
def replace_tags_with_values_from_data(text, tags, data)
if data.present?
tags.reduce(text) do |acc, tag|
replace_tag(acc, tag, data)
end
tags_and_datas.reduce(tokens) do |tokens, (tags, data)|
# Replace tags with their value
tokens.map do |token|
case token
in { tag: _, id: id } if tags.key?(id)
{ text: replace_tag(tags.fetch(id), data) }
in { tag: tag } if tags.key?(tag)
{ text: replace_tag(tags.fetch(tag), data) }
else
token
end
end
end.map do |token|
# Get tokens text representation
case token
in { tag: tag }
"--#{tag}--"
in { text: text }
text
end
end.join('')
end
def replace_tag(text, tag, data)
libelle = Regexp.quote(tag[:id].presence || tag[:libelle])
# allow any kind of space (non-breaking or other) in the tags libellé to match any kind of space in the template
# (the '\\ |' is there because plain ASCII spaces were escaped by preceding Regexp.quote)
libelle.gsub!(/\\ |[[:blank:]]/, "[[:blank:]]")
def replace_tag(tag, data)
if tag.key?(:target)
value = data.send(tag[:target])
data.public_send(tag[:target])
else
value = instance_exec(data, &tag[:lambda])
instance_exec(data, &tag[:lambda])
end
end
text.gsub(/--#{libelle}--/, value.to_s)
# Builds the tags for the procedure's public types de champ (Dossier::SOUMIS)
# and private types de champ (Dossier::INSTRUCTION_COMMENCEE), concatenates
# them in that order and passes the result through filter_tags.
def procedure_types_de_champ_tags
  public_tags = types_de_champ_tags(procedure.types_de_champ_public_for_tags, Dossier::SOUMIS)
  private_tags = types_de_champ_tags(procedure.types_de_champ_private_for_tags, Dossier::INSTRUCTION_COMMENCEE)

  filter_tags(public_tags + private_tags)
end
def normalize_tags(text)
tags = types_de_champ_tags(procedure.types_de_champ_public_for_tags, Dossier::SOUMIS) + types_de_champ_tags(procedure.types_de_champ_private_for_tags, Dossier::INSTRUCTION_COMMENCEE)
[filter_tags(tags).reduce(text) { |text, tag| normalize_tag(text, tag) }, tags]
# Tokenizes +text+ with TagsParser and annotates every tag token whose text
# equals the libellé of one of the procedure's type-de-champ tags with that
# tag's :id. All other tokens are returned untouched.
def parse_tags(text)
  tags_by_libelle = procedure_types_de_champ_tags.index_by { |tag| tag[:libelle] }

  TagsParser.parse(text).map do |token|
    libelle = token[:tag]
    resolvable = token.key?(:tag) && tags_by_libelle.key?(libelle)

    if resolvable
      { tag: libelle, id: tags_by_libelle.fetch(libelle).fetch(:id) }
    else
      token
    end
  end
end
def normalize_tag(text, tag)
libelle = Regexp.quote(tag[:libelle])
# allow any kind of space (non-breaking or other) in the tags libellé to match any kind of space in the template
# (the '\\ |' is there because plain ASCII spaces were escaped by preceding Regexp.quote)
libelle.gsub!(/\\ |[[:blank:]]/, "[[:blank:]]")
text.gsub(/--#{libelle}--/, "--#{tag[:id]}--")
# Lists the tags used in +text+. A tag token carrying an :id yields
# [id, tag text]; a tag token without :id yields [tag text]; tokens that are
# not tags are dropped.
def used_tags_and_libelle_for(text)
  parse_tags(text).filter_map do |token|
    next unless token.key?(:tag)

    token.key?(:id) ? [token[:id], token[:tag]] : [token[:tag]]
  end
end
end

View file

@ -29,4 +29,28 @@ namespace :benchmarks do
x.report("Démarche 55824") { PiecesJustificativesService.generate_dossier_export(p_55824.dossiers.limit(10_000)) }
end
end
# Round-trip check for TagsParser: every AttestationTemplate body is parsed,
# serialized back (`--tag--` for tag tokens, raw text for text tokens) and
# compared with the original body. The task aborts on the first template that
# fails to parse or does not round-trip.
desc 'Attestation Template parser'
task attestation_template_parser: :environment do
  progress = ProgressReport.new(AttestationTemplate.count)

  AttestationTemplate.find_each do |template|
    parsed = TagsSubstitutionConcern::TagsParser.parse(template.body)
    serialized = parsed.map do |token|
      case token
      in { tag: tag }
        "--#{tag}--"
      in { text: text }
        text
      end
    end.join('')

    if serialized != template.body
      # `raise`, not `throw`: `throw` is the catch/throw control-flow primitive
      # and would surface here as an unrelated UncaughtThrowError.
      raise "Template '#{serialized}' is not eq '#{template.body}' with attestation template #{template.id}"
    end

    progress.inc
  rescue StandardError
    pp "Error with attestation template #{template.id}"
    # Bare `raise` re-raises the current exception with its class, message
    # and backtrace intact.
    raise
  end

  progress.finish
end
end

View file

@ -383,13 +383,13 @@ describe TagsSubstitutionConcern, type: :model do
context 'when generating a document for a dossier that is not termine' do
let(:dossier) { create(:dossier) }
let(:template) { '--motivation-- --date de décision--' }
let(:template) { 'text --motivation-- --date de décision--' }
let(:state) { Dossier.states.fetch(:en_instruction) }
subject { template_concern.send(:replace_tags, template, dossier) }
it "does not treat motivation or date de décision as tags" do
is_expected.to eq('--motivation-- --date de décision--')
is_expected.to eq('text --motivation-- --date de décision--')
end
end
@ -495,4 +495,19 @@ describe TagsSubstitutionConcern, type: :model do
it { is_expected.to eq([["public", procedure.draft_revision.types_de_champ.first.stable_id], ['yolo']]) }
end
# Exercises TagsParser directly: tags are extracted wherever a well-formed
# `--...--` pair appears (even glued to surrounding text, as in `test--yolo--`),
# while the lone `---` line, which is not closed on the same line, must stay
# part of the surrounding text token.
describe 'parser' do
it do
tokens = TagsSubstitutionConcern::TagsParser.parse("hello world --public--, --numéro du dossier--, un test--yolo-- encore du text\n---\n encore du text")
expect(tokens).to eq([
{ text: "hello world " },
{ tag: "public" },
{ text: ", " },
# Non-ASCII characters and inner spaces are preserved in the tag text.
{ tag: "numéro du dossier" },
{ text: ", un test" },
{ tag: "yolo" },
# Everything after the last tag, including the `---` line, is a single text token.
{ text: " encore du text\n" + "---\n" + " encore du text" }
])
end
end
end