ANTLR4:根据正则表达式生成匹配字符串的生成器(一)

1、语法和词法文件

词法文件和语法文件参照 GitHub 上的开源语法(见下载地址)。

1、词法文件 regexLexer.g4

less 复制代码
/*
 * [The "BSD license"]
 *  Copyright (c) 2019 PANTHEON.tech
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Lexer grammar for https://www.w3.org/TR/2004/REC-xmlschema-2-20041028/#regexs.
 *
 * This grammar is modified in following ways:
 * - we use lexer modes to disambiguate between Char, XmlChar and QuantExact
 * - we use separate lexer tokens to disambiguate positive and negative character groups
 * - XmlCharIncDash is removed in favor of DASH token, which is handled in parser
 */

// $antlr-format alignTrailingComments true, columnLimit 150, maxEmptyLinesToKeep 1, reflowComments false, useTab false
// $antlr-format allowShortRulesOnASingleLine true, allowShortBlocksOnASingleLine true, minEmptyLines 0, alignSemicolons ownLine
// $antlr-format alignColons trailing, singleLineOverrulesHangingColon true, alignLexerCommands true, alignLabels true, alignTrailers true

// Lexer for XSD-style regular expressions. It uses four modes:
//   default   - structural metacharacters, ordinary characters and escapes
//   QUANTITY  - the inside of a '{...}' quantifier
//   CATEGORY  - the inside of a '\p{...}' / '\P{...}' escape
//   CHARGROUP - the inside of a '[...]' / '[^...]' character group
lexer grammar regexLexer;

LPAREN      : '(';
RPAREN      : ')';
PIPE        : '|';
PLUS        : '+';
QUESTION    : '?';
STAR        : '*';
WildcardEsc : '.';
// Any single character that is not a metacharacter. The backslash must be written as '\\' inside an
// ANTLR literal. Note that no default-mode rule matches a bare '}', ']' or '$', so those characters
// are reported as token recognition errors when they appear unescaped outside a group/quantifier.
Char        : ~('.' | '\\' | '?' | '*' | '+' | '(' | ')' | '|' | '[' | ']' | '{' | '}'  | '$');

// Quantifier's quantity rule support: '{' switches to QUANTITY mode until the matching '}'
StartQuantity: '{' -> pushMode(QUANTITY);
// Single Character Escape
SingleCharEsc: SINGLE_ESC;

// Multi-Character Escape
MultiCharEsc: MULTI_ESC;

// Category Escape: the property name is lexed in CATEGORY mode
CatEsc   : CAT_ESC   -> pushMode(CATEGORY);
ComplEsc : COMPL_ESC -> pushMode(CATEGORY);

// Positive/Negative Character Group: '[^' is listed first and is longer, so it wins over '['
NegCharGroup : '[^' -> pushMode(CHARGROUP);
PosCharGroup : '['  -> pushMode(CHARGROUP);

mode QUANTITY;
// '}' returns to the mode that was active before the '{'
EndQuantity : '}' -> popMode;
QuantExact  : [0-9]+;
COMMA       : ',';

mode CATEGORY;
// '}' returns to the mode that was active before the '\p{' / '\P{'
EndCategory: '}' -> popMode;

// Categories: a major class letter optionally followed by a subclass letter
IsCategory  : Letters | Marks | Numbers | Punctuation | Separators | Symbols | Others;
Letters     : 'L' [ultmo]?;
Marks       : 'M' [nce]?;
Numbers     : 'N' [dlo]?;
Punctuation : 'P' [cdseifo]?;
Separators  : 'Z' [slp]?;
Symbols     : 'S' [mcko]?;
Others      : 'C' [cfon]?;

// Block Escape
IsBlock: 'Is' ([a-z0-9A-Z] | '-')+;

mode CHARGROUP;
// Inside a group the same escapes are emitted under "Nested..." token names so that the parser can
// tell them apart from their default-mode counterparts.
NestedSingleCharEsc : SINGLE_ESC;
NestedMultiCharEsc  : MULTI_ESC;
NestedCatEsc        : CAT_ESC   -> pushMode(CATEGORY);
NestedComplEsc      : COMPL_ESC -> pushMode(CATEGORY);
NestedNegCharGroup  : '[^'      -> pushMode(CHARGROUP);
NestedPosCharGroup  : '['       -> pushMode(CHARGROUP);
EndCharGroup        : ']'       -> popMode;
DASH                : '-';
XmlChar             : ~('-' | '[' | ']');

// Shared escape fragments. Every literal backslash is written as '\\'; inside a [...] set a literal
// backslash is likewise written as '\\'.
fragment CAT_ESC    : '\\p{';
fragment COMPL_ESC  : '\\P{';
fragment MULTI_ESC  : '\\' [sSiIcCdDwW];
fragment SINGLE_ESC : '\\' [nrt\\|.?*+(){}\u002D\u005B\u005D\u005E];

2、语法文件 regexParser.g4

css 复制代码
/*
 * [The "BSD license"]
 *  Copyright (c) 2019 PANTHEON.tech
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Parser grammar for https://www.w3.org/TR/2004/REC-xmlschema-2-20041028/#regexs.
 *
 * This grammar is modified in following ways:
 * - charGroup definition inlines the charClassSub case
 *   This allows us to simplify processing, eliminating one level of nesting. It
 *   also makes this rule consistent with XSD 1.1 definition.
 */

// $antlr-format alignTrailingComments true, columnLimit 150, minEmptyLines 1, maxEmptyLinesToKeep 1, reflowComments false, useTab false
// $antlr-format allowShortRulesOnASingleLine false, allowShortBlocksOnASingleLine true, alignSemicolons hanging, alignColons hanging

// Parser for the token stream produced by regexLexer (see tokenVocab below).
parser grammar regexParser;

options {
    tokenVocab = regexLexer;
}

// Parser root context, ensures all input is matched
root
    : regExp EOF
    ;

// Regular Expression: one or more branches separated by '|'
regExp
    : branch (PIPE branch)*
    ;

// Branch: a sequence of pieces; may be empty
branch
    : piece*
    ;

// Piece: an atom with an optional repetition quantifier
piece
    : atom quantifier?
    ;

// Quantifier: '?', '*', '+' or an explicit '{...}' quantity
quantifier
    : QUESTION
    | STAR
    | PLUS
    | StartQuantity quantity EndQuantity
    ;

// Contents of '{...}': "min,max", "min," or an exact count
quantity
    : quantRange
    | quantMin
    | QuantExact
    ;

// "min,max"
quantRange
    : QuantExact COMMA QuantExact
    ;

// "min," (no upper bound given)
quantMin
    : QuantExact COMMA
    ;

// Atom: an ordinary character, a character class, or a parenthesized sub-expression
atom
    : Char
    | charClass
    | (LPAREN regExp RPAREN)
    ;

// Character Class
charClass
    : charClassEsc
    | charClassExpr
    | WildcardEsc
    ;

// Character Class Expression
// Accepts both the default-mode and the CHARGROUP-mode ("Nested...") opening tokens.
charClassExpr
    : (NegCharGroup | NestedNegCharGroup | PosCharGroup | NestedPosCharGroup) charGroup EndCharGroup
    ;

// Character Group
// In order to disambiguate the use of DASH's roles in Character Class Subtraction and in posCharGroup
// tail, we explicitly handle it here. ANTLR will consider the subrules in order and they completely
// disambiguate use [a--[f]], [a-[f]], [a-], [a]. We have borrowed some of the clarification from
// https://www.w3.org/TR/2012/REC-xmlschema11-2-20120405/ to make this work
charGroup
    : posCharGroup? DASH DASH charClassExpr
    | posCharGroup DASH charClassExpr
    | posCharGroup DASH?
    | DASH
    ;

// Positive Character Group: optional leading DASH followed by one or more ranges/escapes
posCharGroup
    : DASH? (charRange | charClassEsc)+
    ;

// Character Range, sans the DASH possibility
charRange
    : seRange
    | XmlChar
    ;

// Start-end range such as a-z
seRange
    : charOrEsc DASH charOrEsc
    ;

// A range endpoint.
// NOTE(review): this references SingleCharEsc, while the lexer's CHARGROUP mode emits
// NestedSingleCharEsc for escapes - confirm whether an escaped endpoint can actually reach this rule.
charOrEsc
    : XmlChar
    | SingleCharEsc
    ;

// Character Class Escape
charClassEsc
    : SingleCharEsc
    | NestedSingleCharEsc
    | MultiCharEsc
    | NestedMultiCharEsc
    | catEsc
    | complEsc
    ;

// Category Escape
catEsc
    : (CatEsc | NestedCatEsc) charProp EndCategory
    ;

// Complement of a Category Escape
complEsc
    : (ComplEsc | NestedComplEsc) charProp EndCategory
    ;

// Property named inside the braces: a category or a block
charProp
    : IsCategory
    | IsBlock
    ;

2、根据词法和文法文件生成代码

3、使用访问者模式进行处理

创建 RegVistor2.java,详细的处理过程请参照代码中的注释。

java 复制代码
package com.regex;

import com.regex.g4.regexParser;
import com.regex.g4.regexParserBaseVisitor;
import org.antlr.v4.runtime.tree.TerminalNode;

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * 遍历整个正则的语法分析树
 */
public class RegVistor2 extends regexParserBaseVisitor<String> {

    // 随机字符串
    private final static Random R = new Random();

    // 是否包含特殊字符
    private final static Boolean MAYBE_SPECIAL_CHAR = Boolean.TRUE;


    //    root
    //    : regExp EOF
    //            ;
    @Override
    public String visitRoot(regexParser.RootContext ctx) {
        // 直接访问子节点
        return visitRegExp(ctx.regExp());
    }


//    regExp
//    : branch (PIPE branch)*
//    ;
    @Override
    public String visitRegExp(regexParser.RegExpContext ctx) {
        // 或者任意一个分支的结果即可
        int index = R.nextInt(ctx.branch().size());
        // 获取其中一个分支的结果
        return visitBranch(ctx.branch(index));
    }

//    branch
//    : piece*
//    ;
    @Override
    public String visitBranch(regexParser.BranchContext ctx) {
        StringBuilder stringBuilder = new StringBuilder();
        // 遍历piece 结果,即可或者所有的结果
        for (regexParser.PieceContext pieceContext : ctx.piece()) {
            stringBuilder.append(visitPiece(pieceContext));
        }
        return stringBuilder.toString();
    }

//    piece
//    : atom quantifier?
//    ;
    @Override
    public String visitPiece(regexParser.PieceContext ctx) {
        StringBuilder sb = new StringBuilder();
        // 获取字符
        regexParser.AtomContext atom = ctx.atom();
        // quantifier 次数
        regexParser.QuantifierContext quantifier = ctx.quantifier();
        if (quantifier != null) {
            String num = visitQuantifier(quantifier);
            Integer regCount = Integer.valueOf(num);
            for (Integer i = 0; i < regCount; i++) {
                String data = visitAtom(atom);
                sb.append(data);
            }
            return sb.toString();
        }
        // 只有字符就返回生成的字符
        return visitAtom(atom);
    }



//    atom
//    : Char
//    | charClass
//    | (LPAREN regExp RPAREN)
//    ;
    @Override
    public String visitAtom(regexParser.AtomContext ctx) {
        // 字符直接匹配
        if (ctx.Char() != null) {
            return ctx.Char().getText();
        }
        // 其他访问子分支
        return super.visitAtom(ctx);
    }

//    charClass
//    : charClassEsc
//    | charClassExpr
//    | WildcardEsc
//    ;
    @Override
    public String visitCharClass(regexParser.CharClassContext ctx) {
        // 都是优先处理终结符  匹配. 返回一个任意字符
        if (ctx.WildcardEsc() != null) {
            // 返回一个任意字符,
            int[][] range = {{0x4e00, 0x9fa5}, {0x30, 0x39}, {0x61, 0x7a}, {0x41, 0x5a}};
            return getRandomChar(range);
        }
        // 访问其他子节点
        return super.visitCharClass(ctx);
    }

    // 生成随机字符串
    private String getRandomChar(int[][] range) {

        String specialCharStr = "!@#$%^&*()_+|\/?'"',,<>,.·~`";
        // 包含特殊字符
        if (RegVistor2.MAYBE_SPECIAL_CHAR && R.nextBoolean()) {
            return String.valueOf(specialCharStr.charAt(R.nextInt(specialCharStr.length())));
        } else {
            // 随机选择一组
            int[] subRange = range[R.nextInt(range.length)];
            char charValue = (char) (subRange[0] + R.nextInt(subRange[1] - subRange[0] + 1));
            return String.valueOf(charValue);
        }
    }


//    charClassEsc
//    : SingleCharEsc
//    | NestedSingleCharEsc
//    | MultiCharEsc
//    | NestedMultiCharEsc
//    | catEsc
//    | complEsc
//    ;
    @Override
    public String visitCharClassEsc(regexParser.CharClassEscContext ctx) {

        // 单字符
        if (ctx.SingleCharEsc() != null) {
            return ctx.SingleCharEsc().getText();
        }
        // 反逻辑单字符
        if (ctx.NestedSingleCharEsc() != null) {
            return ctx.NestedSingleCharEsc().getText();
        }
        // /d /w 等处理
        if (ctx.MultiCharEsc() != null) {
            return getMultiCharEscStr(ctx.MultiCharEsc().getText());
        }
        // /D 反逻辑处理
        if (ctx.NestedMultiCharEsc() != null) {
            return getMultiCharEscStr(ctx.NestedMultiCharEsc().getText());
        }
        // 访问子节点
       return super.visitCharClassEsc(ctx);
    }

    /**
     * 根据字符,来或者返回的字符
     * @param text
     * @return
     */
    private String getMultiCharEscStr(String text) {

        List<String> rangeList = new ArrayList();
        switch (text) {
            // 空白字符
            case "\s":
                rangeList.add(randomChar(' ', ' '));
                rangeList.add(randomChar('\t', '\t'));
                rangeList.add(randomChar('\r', '\r'));
                rangeList.add(randomChar('\n', '\n'));
                break;
            case "\S":

                rangeList.add(randomChar(0x4e00, 0x9fa5));
                rangeList.add(randomChar(0x30, 0x39));
                rangeList.add(randomChar(0x61, 0x7a));
                rangeList.add(randomChar(0x41, 0x5a));

                break;

            case "\i":
            case "\c":
            case "\C":
            case "\I":
                throw new  RuntimeException("notSupportExp_i_I_c_C");
            case "\d":

                rangeList.add(randomChar(0x30, 0x39));
                break;


            case "\D":
                // \D:表示非数字
                rangeList.add(randomChar(0x4e00, 0x8fa5));
                rangeList.add(randomChar(0x61, 0x7a));
                rangeList.add(randomChar(0x41, 0x5a));
                break;


            case "\w":
                // \w 表示匹配大小写英文字母、数字以及下划线,等价于'[a-za-z0-9_]'。
                rangeList.add(randomChar(0x61, 0x7a));
                rangeList.add(randomChar(0x41, 0x5a));
                rangeList.add(randomChar(0x30, 0x39));

                break;
            case "\W":
                // \W :匹配任何非单词字符,等价于 [^A-Z a-z 0-9_]
                rangeList.add(randomChar(0x4e00, 0x9fa5));
                break;

        }

       return rangeList.get(R.nextInt(rangeList.size()));
    }


    // Character Class Expression
////    charClassExpr
////    : (NegCharGroup | NestedNegCharGroup | PosCharGroup | NestedPosCharGroup) charGroup EndCharGroup
//            ;
    @Override
    public String visitCharClassExpr(regexParser.CharClassExprContext ctx) {

        // 同时有 [^ 或者 [ 暂不支持
        if (ctx.NestedNegCharGroup() != null || ctx.NestedPosCharGroup() != null) {
            throw new RuntimeException("notSupportNestedCharGroup()");
        }
        return visitCharGroup(ctx.charGroup());
    }


//    charGroup
//    : posCharGroup? DASH DASH charClassExpr
//    | posCharGroup DASH charClassExpr
//    | posCharGroup DASH?
//            | DASH
//    ;
    @Override
    public String visitCharGroup(regexParser.CharGroupContext ctx) {
        // 获取一个列表
        List<String> datas = new ArrayList<>();
        datas.add(visitPosCharGroup(ctx.posCharGroup()));
        // 获取任意一个
        return datas.get(R.nextInt(datas.size()));
    }


    // Positive Character Group
//    posCharGroup
//    : DASH? (charRange | charClassEsc)+
//    ;
    @Override
    public String visitPosCharGroup(regexParser.PosCharGroupContext ctx) {
        List<String> datas = new ArrayList<>();
        if(ctx.DASH() != null) {
            datas.add("-");
        }
        // 添加到可选项中
        for (regexParser.CharClassEscContext charClassEscContext : ctx.charClassEsc()) {
            datas.add(visitCharClassEsc(charClassEscContext));
        }
        //  添加到可选项中
        for (regexParser.CharRangeContext charRangeContext : ctx.charRange()) {
            datas.add(visitCharRange(charRangeContext));
        }
        // 获取任意一个
        return datas.get(R.nextInt(datas.size()));
    }



//    charRange
//    : seRange
//    | XmlChar
//    ;

    @Override
    public String visitCharRange(regexParser.CharRangeContext ctx) {

        if (ctx.XmlChar() != null) {
            return ctx.XmlChar().getText();
        }
        // 接着访问子节点
        return super.visitCharRange(ctx);
    }


//    seRange
//    : charOrEsc DASH charOrEsc
//    ;
    @Override
    public String visitSeRange(regexParser.SeRangeContext ctx) {
        // [a-w] [0-9]
        String start = ctx.charOrEsc(0).getText();
        String end = ctx.charOrEsc(1).getText();
        // 返回随机数
       return randomChar(start.charAt(0),end.charAt(0));
    }


//    quantifier
//    : QUESTION
//    | STAR
//    | PLUS
//    | StartQuantity quantity EndQuantity
//    ;
    @Override
    public String visitQuantifier(regexParser.QuantifierContext ctx) {

        // 匹配?,就返回随机值,0或者1
        if (ctx.QUESTION() != null) {
            return String.valueOf(R.nextInt(2));
        }
        // 匹配 * 号,返回 0-15 之间的数字
        if (ctx.STAR() != null) {
            return String.valueOf(R.nextInt(15));
        }
        // 匹配+  1到21 之间的整数,
        if (ctx.PLUS() != null) {
            return String.valueOf(R.nextInt(20) + 1);
        }
        // 有进行处理
        if (ctx.quantity() != null) {
            return visitQuantity(ctx.quantity());
        }
        return "0";
    }

//    quantity
//    : quantRange
//    | quantMin
//    | QuantExact
//    ;
    @Override
    public String visitQuantity(regexParser.QuantityContext ctx) {
        // 这种情况要进行特殊处理下
        if (ctx.QuantExact() != null) {
            return ctx.QuantExact().getText();
        }

        if (ctx.quantMin() != null) {
            return visitQuantMin(ctx.quantMin());
        }

        if (ctx.quantRange() != null) {
            return visitQuantRange(ctx.quantRange());
        }
        return "0";
    }

//    quantRange
//    : QuantExact COMMA QuantExact
//    ;  {4,9} 等

    @Override
    public String visitQuantRange(regexParser.QuantRangeContext ctx) {
        TerminalNode terminalNodeMin = ctx.QuantExact(0);
        TerminalNode terminalNodeMax = ctx.QuantExact(1);

        int min = Integer.parseInt(terminalNodeMin.getText());

        int max = Integer.parseInt(terminalNodeMax.getText());

        if (min > max) {
            throw new RuntimeException("范围错误");
        }
        // 从可选范围中,选中一个
        return String.valueOf(min + R.nextInt(max - min + 1));
    }


//    quantMin
//    : QuantExact COMMA
//            ; {8,}
    @Override
    public String visitQuantMin(regexParser.QuantMinContext ctx) {

        if (ctx.QuantExact() != null) {
            TerminalNode terminalNode = ctx.QuantExact();
            String text = terminalNode.getText();
            int min = Integer.parseInt(text);
            // 最小数 min - min+10
            return String.valueOf(R.nextInt(10) + min);
        }
        return "0";
    }

    /**
     * 生成随机字符串
     * @param start
     * @param end
     * @return
     */
    public String randomChar(int start,int end) {
        Random random = new Random();
        return String.valueOf((char) (start + random.nextInt(end - start + 1)));
    }

}

4、创建工具类

RegexUtil.java

ini 复制代码
/**
 * Helper that turns a regular expression into one randomly generated string.
 */
public class RegexUtil {

    /**
     * Lexes and parses {@code regex}, then lets a {@code RegVistor2} generate text from the tree.
     *
     * <p>The default error listeners are removed from both the lexer and the parser and none is
     * installed in their place, so syntax errors in {@code regex} are not reported here.
     *
     * @param regex the regular expression to generate a string for
     * @return the text produced by visiting the parse tree
     */
    public static String gen(String regex)  {
        // Tokenize without the default console error output.
        regexLexer lexer = new regexLexer(CharStreams.fromString(regex));
        lexer.removeErrorListeners();

        // Parse without the default console error output.
        regexParser parser = new regexParser(new CommonTokenStream(lexer));
        parser.removeErrorListeners();

        regexParser.RootContext tree = parser.root();
        return new RegVistor2().visitRoot(tree);
    }

}

5、创建测试代码

RgxgenT.java

csharp 复制代码
public static void main(String[] args) {

    System.out.println(RegexUtil.gen("0\d{2,3}-\d{7,8}\w+"));

    System.out.println(RegexUtil.gen("[1-9]{8}"));
}

输出如下:

yaml 复制代码
0353-0231554Wzz6Om15BW35W2H43S
64335143

6、问题记录

1、当前还没有处理中文的随机生成

2、对于 [^1-9]的这种反逻辑没有处理

3、错误语法没处理

相关推荐
许野平1 小时前
Rust: 利用 chrono 库实现日期和字符串互相转换
开发语言·后端·rust·字符串·转换·日期·chrono
齐 飞2 小时前
MongoDB笔记01-概念与安装
前端·数据库·笔记·后端·mongodb
LunarCod3 小时前
WorkFlow源码剖析——Communicator之TCPServer(中)
后端·workflow·c/c++·网络框架·源码剖析·高性能高并发
码农派大星。3 小时前
Spring Boot 配置文件
java·spring boot·后端
杜杜的man4 小时前
【go从零单排】go中的结构体struct和method
开发语言·后端·golang
幼儿园老大*4 小时前
走进 Go 语言基础语法
开发语言·后端·学习·golang·go
llllinuuu4 小时前
Go语言结构体、方法与接口
开发语言·后端·golang
cookies_s_s4 小时前
Golang--协程和管道
开发语言·后端·golang
为什么这亚子4 小时前
九、Go语言快速入门之map
运维·开发语言·后端·算法·云原生·golang·云计算
想进大厂的小王4 小时前
项目架构介绍以及Spring cloud、redis、mq 等组件的基本认识
redis·分布式·后端·spring cloud·微服务·架构