1、语法和词法文件
1、词法文件 regexLexer.g4
less
/*
* [The "BSD license"]
* Copyright (c) 2019 PANTHEON.tech
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Lexer grammar for https://www.w3.org/TR/2004/REC-xmlschema-2-20041028/#regexs.
*
* This grammar is modified in following ways:
* - we use lexer modes to disambiguate between Char, XmlChar and QuantExact
* - we use separate lexer tokens to disambiguate positive and negative character groups
* - XmlCharIncDash is removed in favor of DASH token, which is handled in parser
*/
// $antlr-format alignTrailingComments true, columnLimit 150, maxEmptyLinesToKeep 1, reflowComments false, useTab false
// $antlr-format allowShortRulesOnASingleLine true, allowShortBlocksOnASingleLine true, minEmptyLines 0, alignSemicolons ownLine
// $antlr-format alignColons trailing, singleLineOverrulesHangingColon true, alignLexerCommands true, alignLabels true, alignTrailers true
lexer grammar regexLexer;

// ---------------------------------------------------------------------------
// Default mode: tokens that appear outside quantities, categories and
// character groups.
// ---------------------------------------------------------------------------
LPAREN      : '(';
RPAREN      : ')';
PIPE        : '|';
PLUS        : '+';
QUESTION    : '?';
STAR        : '*';
WildcardEsc : '.';

// Any single character that is not one of the listed metacharacters.
// A literal backslash has to be written '\\' inside an ANTLR literal; a lone '\'
// would escape the closing quote and leave the literal unterminated.
Char : ~('.' | '\\' | '?' | '*' | '+' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '$');

// Quantifier's quantity rule support
StartQuantity : '{' -> pushMode(QUANTITY);

// Single Character Escape
SingleCharEsc : SINGLE_ESC;

// Multi-Character Escape
MultiCharEsc : MULTI_ESC;

// Category Escape
CatEsc   : CAT_ESC   -> pushMode(CATEGORY);
ComplEsc : COMPL_ESC -> pushMode(CATEGORY);

// Positive/Negative Character Group
NegCharGroup : '[^' -> pushMode(CHARGROUP);
PosCharGroup : '['  -> pushMode(CHARGROUP);

// ---------------------------------------------------------------------------
// QUANTITY mode: the inside of '{' ... '}' following an atom.
// ---------------------------------------------------------------------------
mode QUANTITY;

EndQuantity : '}' -> popMode;
QuantExact  : [0-9]+;
COMMA       : ',';

// ---------------------------------------------------------------------------
// CATEGORY mode: the inside of '\p{' ... '}' or '\P{' ... '}'.
// ---------------------------------------------------------------------------
mode CATEGORY;

EndCategory : '}' -> popMode;

// Categories
IsCategory  : Letters | Marks | Numbers | Punctuation | Separators | Symbols | Others;
Letters     : 'L' [ultmo]?;
Marks       : 'M' [nce]?;
Numbers     : 'N' [dlo]?;
Punctuation : 'P' [cdseifo]?;
Separators  : 'Z' [slp]?;
Symbols     : 'S' [mcko]?;
Others      : 'C' [cfon]?;

// Block Escape
IsBlock : 'Is' ([a-z0-9A-Z] | '-')+;

// ---------------------------------------------------------------------------
// CHARGROUP mode: the inside of '[' ... ']'. Groups and category escapes push
// further modes, so they can nest.
// ---------------------------------------------------------------------------
mode CHARGROUP;

NestedSingleCharEsc : SINGLE_ESC;
NestedMultiCharEsc  : MULTI_ESC;
NestedCatEsc        : CAT_ESC   -> pushMode(CATEGORY);
NestedComplEsc      : COMPL_ESC -> pushMode(CATEGORY);
NestedNegCharGroup  : '[^'      -> pushMode(CHARGROUP);
NestedPosCharGroup  : '['       -> pushMode(CHARGROUP);
EndCharGroup        : ']'       -> popMode;
DASH                : '-';
XmlChar             : ~('-' | '[' | ']');

// Shared escape fragments. Every backslash is doubled because '\' starts an
// escape sequence both in ANTLR string literals and in character sets.
fragment CAT_ESC    : '\\p{';
fragment COMPL_ESC  : '\\P{';
fragment MULTI_ESC  : '\\' [sSiIcCdDwW];
fragment SINGLE_ESC : '\\' [nrt\\|.?*+(){}\u002D\u005B\u005D\u005E];
2、语法文件 regexParser.g4
css
/*
* [The "BSD license"]
* Copyright (c) 2019 PANTHEON.tech
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Parser grammar for https://www.w3.org/TR/2004/REC-xmlschema-2-20041028/#regexs.
*
* This grammar is modified in following ways:
* - charGroup definition inlines the charClassSub case
* This allows us to simplify processing, eliminating one level of nesting. It
* also makes this rule consistent with XSD 1.1 definition.
*/
// $antlr-format alignTrailingComments true, columnLimit 150, minEmptyLines 1, maxEmptyLinesToKeep 1, reflowComments false, useTab false
// $antlr-format allowShortRulesOnASingleLine false, allowShortBlocksOnASingleLine true, alignSemicolons hanging, alignColons hanging
parser grammar regexParser;
// Token types are taken from the companion lexer grammar.
options {
tokenVocab = regexLexer;
}
// Parser root context, ensures all input is matched
root
: regExp EOF
;
// Regular Expression: one or more branches separated by PIPE.
regExp
: branch (PIPE branch)*
;
// Branch: a (possibly empty) sequence of pieces.
branch
: piece*
;
// Piece: an atom with an optional quantifier.
piece
: atom quantifier?
;
// Quantifier: '?', '*', '+' or an explicit quantity enclosed by StartQuantity/EndQuantity.
quantifier
: QUESTION
| STAR
| PLUS
| StartQuantity quantity EndQuantity
;
// Quantity: a range (min,max), an open-ended minimum (min,) or an exact count.
quantity
: quantRange
| quantMin
| QuantExact
;
// Quantity range: two counts separated by COMMA.
quantRange
: QuantExact COMMA QuantExact
;
// Minimum quantity: a count followed by COMMA and nothing else.
quantMin
: QuantExact COMMA
;
// Atom: a plain character, a character class, or a parenthesised sub-expression.
atom
: Char
| charClass
| (LPAREN regExp RPAREN)
;
// Character Class: an escape, a bracketed expression, or the wildcard.
charClass
: charClassEsc
| charClassExpr
| WildcardEsc
;
// Character Class Expression: an opening group token (positive or negative, top-level or
// nested), the group contents, and the closing EndCharGroup.
charClassExpr
: (NegCharGroup | NestedNegCharGroup | PosCharGroup | NestedPosCharGroup) charGroup EndCharGroup
;
// Character Group
// In order to disambiguate the use of DASH's roles in Character Class Subtraction and in posCharGroup
// tail, we explicitly handle it here. ANTLR will consider the subrules in order and they completely
// disambiguate use [a--[f]], [a-[f]], [a-], [a]. We have borrowed some of the clarification from
// https://www.w3.org/TR/2012/REC-xmlschema11-2-20120405/ to make this work
charGroup
: posCharGroup? DASH DASH charClassExpr
| posCharGroup DASH charClassExpr
| posCharGroup DASH?
| DASH
;
// Positive Character Group: an optional leading DASH followed by one or more ranges or escapes.
posCharGroup
: DASH? (charRange | charClassEsc)+
;
// Character Range, sans the DASH possibility
charRange
: seRange
| XmlChar
;
// Start-end range: two bounds separated by DASH.
seRange
: charOrEsc DASH charOrEsc
;
// A range bound: a plain group character or a single-character escape.
charOrEsc
: XmlChar
| SingleCharEsc
;
// Character Class Escape: single/multi character escapes (top-level or nested token variants)
// and category escapes.
charClassEsc
: SingleCharEsc
| NestedSingleCharEsc
| MultiCharEsc
| NestedMultiCharEsc
| catEsc
| complEsc
;
// Category Escape: opening token, property name, closing EndCategory.
catEsc
: (CatEsc | NestedCatEsc) charProp EndCategory
;
// Complement Category Escape: same shape as catEsc, opened by the complement token.
complEsc
: (ComplEsc | NestedComplEsc) charProp EndCategory
;
// Character property: a category name or a block name.
charProp
: IsCategory
| IsBlock
;
2、根据词法和文法文件生成代码
3、使用访问者模式进行处理
创建 RegVistor2.java,详细的处理过程请参照代码中的注释。
java
package com.regex;
import com.regex.g4.regexParser;
import com.regex.g4.regexParserBaseVisitor;
import org.antlr.v4.runtime.tree.TerminalNode;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
/**
* 遍历整个正则的语法分析树
*/
public class RegVistor2 extends regexParserBaseVisitor<String> {
// 随机字符串
private final static Random R = new Random();
// 是否包含特殊字符
private final static Boolean MAYBE_SPECIAL_CHAR = Boolean.TRUE;
// root
// : regExp EOF
// ;
@Override
public String visitRoot(regexParser.RootContext ctx) {
// 直接访问子节点
return visitRegExp(ctx.regExp());
}
// regExp
// : branch (PIPE branch)*
// ;
@Override
public String visitRegExp(regexParser.RegExpContext ctx) {
// 或者任意一个分支的结果即可
int index = R.nextInt(ctx.branch().size());
// 获取其中一个分支的结果
return visitBranch(ctx.branch(index));
}
// branch
// : piece*
// ;
@Override
public String visitBranch(regexParser.BranchContext ctx) {
StringBuilder stringBuilder = new StringBuilder();
// 遍历piece 结果,即可或者所有的结果
for (regexParser.PieceContext pieceContext : ctx.piece()) {
stringBuilder.append(visitPiece(pieceContext));
}
return stringBuilder.toString();
}
// piece
// : atom quantifier?
// ;
@Override
public String visitPiece(regexParser.PieceContext ctx) {
StringBuilder sb = new StringBuilder();
// 获取字符
regexParser.AtomContext atom = ctx.atom();
// quantifier 次数
regexParser.QuantifierContext quantifier = ctx.quantifier();
if (quantifier != null) {
String num = visitQuantifier(quantifier);
Integer regCount = Integer.valueOf(num);
for (Integer i = 0; i < regCount; i++) {
String data = visitAtom(atom);
sb.append(data);
}
return sb.toString();
}
// 只有字符就返回生成的字符
return visitAtom(atom);
}
// atom
// : Char
// | charClass
// | (LPAREN regExp RPAREN)
// ;
@Override
public String visitAtom(regexParser.AtomContext ctx) {
// 字符直接匹配
if (ctx.Char() != null) {
return ctx.Char().getText();
}
// 其他访问子分支
return super.visitAtom(ctx);
}
// charClass
// : charClassEsc
// | charClassExpr
// | WildcardEsc
// ;
@Override
public String visitCharClass(regexParser.CharClassContext ctx) {
// 都是优先处理终结符 匹配. 返回一个任意字符
if (ctx.WildcardEsc() != null) {
// 返回一个任意字符,
int[][] range = {{0x4e00, 0x9fa5}, {0x30, 0x39}, {0x61, 0x7a}, {0x41, 0x5a}};
return getRandomChar(range);
}
// 访问其他子节点
return super.visitCharClass(ctx);
}
// 生成随机字符串
private String getRandomChar(int[][] range) {
String specialCharStr = "!@#$%^&*()_+|\/?'"',,<>,.·~`";
// 包含特殊字符
if (RegVistor2.MAYBE_SPECIAL_CHAR && R.nextBoolean()) {
return String.valueOf(specialCharStr.charAt(R.nextInt(specialCharStr.length())));
} else {
// 随机选择一组
int[] subRange = range[R.nextInt(range.length)];
char charValue = (char) (subRange[0] + R.nextInt(subRange[1] - subRange[0] + 1));
return String.valueOf(charValue);
}
}
// charClassEsc
// : SingleCharEsc
// | NestedSingleCharEsc
// | MultiCharEsc
// | NestedMultiCharEsc
// | catEsc
// | complEsc
// ;
@Override
public String visitCharClassEsc(regexParser.CharClassEscContext ctx) {
// 单字符
if (ctx.SingleCharEsc() != null) {
return ctx.SingleCharEsc().getText();
}
// 反逻辑单字符
if (ctx.NestedSingleCharEsc() != null) {
return ctx.NestedSingleCharEsc().getText();
}
// /d /w 等处理
if (ctx.MultiCharEsc() != null) {
return getMultiCharEscStr(ctx.MultiCharEsc().getText());
}
// /D 反逻辑处理
if (ctx.NestedMultiCharEsc() != null) {
return getMultiCharEscStr(ctx.NestedMultiCharEsc().getText());
}
// 访问子节点
return super.visitCharClassEsc(ctx);
}
/**
* 根据字符,来或者返回的字符
* @param text
* @return
*/
private String getMultiCharEscStr(String text) {
List<String> rangeList = new ArrayList();
switch (text) {
// 空白字符
case "\s":
rangeList.add(randomChar(' ', ' '));
rangeList.add(randomChar('\t', '\t'));
rangeList.add(randomChar('\r', '\r'));
rangeList.add(randomChar('\n', '\n'));
break;
case "\S":
rangeList.add(randomChar(0x4e00, 0x9fa5));
rangeList.add(randomChar(0x30, 0x39));
rangeList.add(randomChar(0x61, 0x7a));
rangeList.add(randomChar(0x41, 0x5a));
break;
case "\i":
case "\c":
case "\C":
case "\I":
throw new RuntimeException("notSupportExp_i_I_c_C");
case "\d":
rangeList.add(randomChar(0x30, 0x39));
break;
case "\D":
// \D:表示非数字
rangeList.add(randomChar(0x4e00, 0x8fa5));
rangeList.add(randomChar(0x61, 0x7a));
rangeList.add(randomChar(0x41, 0x5a));
break;
case "\w":
// \w 表示匹配大小写英文字母、数字以及下划线,等价于'[a-za-z0-9_]'。
rangeList.add(randomChar(0x61, 0x7a));
rangeList.add(randomChar(0x41, 0x5a));
rangeList.add(randomChar(0x30, 0x39));
break;
case "\W":
// \W :匹配任何非单词字符,等价于 [^A-Z a-z 0-9_]
rangeList.add(randomChar(0x4e00, 0x9fa5));
break;
}
return rangeList.get(R.nextInt(rangeList.size()));
}
// Character Class Expression
//// charClassExpr
//// : (NegCharGroup | NestedNegCharGroup | PosCharGroup | NestedPosCharGroup) charGroup EndCharGroup
// ;
@Override
public String visitCharClassExpr(regexParser.CharClassExprContext ctx) {
// 同时有 [^ 或者 [ 暂不支持
if (ctx.NestedNegCharGroup() != null || ctx.NestedPosCharGroup() != null) {
throw new RuntimeException("notSupportNestedCharGroup()");
}
return visitCharGroup(ctx.charGroup());
}
// charGroup
// : posCharGroup? DASH DASH charClassExpr
// | posCharGroup DASH charClassExpr
// | posCharGroup DASH?
// | DASH
// ;
@Override
public String visitCharGroup(regexParser.CharGroupContext ctx) {
// 获取一个列表
List<String> datas = new ArrayList<>();
datas.add(visitPosCharGroup(ctx.posCharGroup()));
// 获取任意一个
return datas.get(R.nextInt(datas.size()));
}
// Positive Character Group
// posCharGroup
// : DASH? (charRange | charClassEsc)+
// ;
@Override
public String visitPosCharGroup(regexParser.PosCharGroupContext ctx) {
List<String> datas = new ArrayList<>();
if(ctx.DASH() != null) {
datas.add("-");
}
// 添加到可选项中
for (regexParser.CharClassEscContext charClassEscContext : ctx.charClassEsc()) {
datas.add(visitCharClassEsc(charClassEscContext));
}
// 添加到可选项中
for (regexParser.CharRangeContext charRangeContext : ctx.charRange()) {
datas.add(visitCharRange(charRangeContext));
}
// 获取任意一个
return datas.get(R.nextInt(datas.size()));
}
// charRange
// : seRange
// | XmlChar
// ;
@Override
public String visitCharRange(regexParser.CharRangeContext ctx) {
if (ctx.XmlChar() != null) {
return ctx.XmlChar().getText();
}
// 接着访问子节点
return super.visitCharRange(ctx);
}
// seRange
// : charOrEsc DASH charOrEsc
// ;
@Override
public String visitSeRange(regexParser.SeRangeContext ctx) {
// [a-w] [0-9]
String start = ctx.charOrEsc(0).getText();
String end = ctx.charOrEsc(1).getText();
// 返回随机数
return randomChar(start.charAt(0),end.charAt(0));
}
// quantifier
// : QUESTION
// | STAR
// | PLUS
// | StartQuantity quantity EndQuantity
// ;
@Override
public String visitQuantifier(regexParser.QuantifierContext ctx) {
// 匹配?,就返回随机值,0或者1
if (ctx.QUESTION() != null) {
return String.valueOf(R.nextInt(2));
}
// 匹配 * 号,返回 0-15 之间的数字
if (ctx.STAR() != null) {
return String.valueOf(R.nextInt(15));
}
// 匹配+ 1到21 之间的整数,
if (ctx.PLUS() != null) {
return String.valueOf(R.nextInt(20) + 1);
}
// 有进行处理
if (ctx.quantity() != null) {
return visitQuantity(ctx.quantity());
}
return "0";
}
// quantity
// : quantRange
// | quantMin
// | QuantExact
// ;
@Override
public String visitQuantity(regexParser.QuantityContext ctx) {
// 这种情况要进行特殊处理下
if (ctx.QuantExact() != null) {
return ctx.QuantExact().getText();
}
if (ctx.quantMin() != null) {
return visitQuantMin(ctx.quantMin());
}
if (ctx.quantRange() != null) {
return visitQuantRange(ctx.quantRange());
}
return "0";
}
// quantRange
// : QuantExact COMMA QuantExact
// ; {4,9} 等
@Override
public String visitQuantRange(regexParser.QuantRangeContext ctx) {
TerminalNode terminalNodeMin = ctx.QuantExact(0);
TerminalNode terminalNodeMax = ctx.QuantExact(1);
int min = Integer.parseInt(terminalNodeMin.getText());
int max = Integer.parseInt(terminalNodeMax.getText());
if (min > max) {
throw new RuntimeException("范围错误");
}
// 从可选范围中,选中一个
return String.valueOf(min + R.nextInt(max - min + 1));
}
// quantMin
// : QuantExact COMMA
// ; {8,}
@Override
public String visitQuantMin(regexParser.QuantMinContext ctx) {
if (ctx.QuantExact() != null) {
TerminalNode terminalNode = ctx.QuantExact();
String text = terminalNode.getText();
int min = Integer.parseInt(text);
// 最小数 min - min+10
return String.valueOf(R.nextInt(10) + min);
}
return "0";
}
/**
* 生成随机字符串
* @param start
* @param end
* @return
*/
public String randomChar(int start,int end) {
Random random = new Random();
return String.valueOf((char) (start + random.nextInt(end - start + 1)));
}
}
4、创建工具类
RegexUtil.java
ini
/**
 * Convenience entry point that turns a regular expression into a randomly generated string.
 */
public class RegexUtil {
    /**
     * Lexes and parses {@code regex}, then runs {@link RegVistor2} over the resulting tree.
     *
     * <p>The default error listeners are removed from both the lexer and the parser and none are
     * installed in their place, so this method does not report syntax errors in {@code regex}.
     *
     * @param regex the regular expression to generate a string for
     * @return the text produced by the visitor for the parsed expression
     */
    public static String gen(String regex) {
        // Lexing stage.
        regexLexer lexer = new regexLexer(CharStreams.fromString(regex));
        lexer.removeErrorListeners();

        // Parsing stage.
        CommonTokenStream tokenStream = new CommonTokenStream(lexer);
        regexParser parser = new regexParser(tokenStream);
        parser.removeErrorListeners();
        regexParser.RootContext tree = parser.root();

        // Generation stage.
        return new RegVistor2().visitRoot(tree);
    }
}
5、创建测试代码
RgxgenT.java
csharp
public static void main(String[] args) {
System.out.println(RegexUtil.gen("0\d{2,3}-\d{7,8}\w+"));
System.out.println(RegexUtil.gen("[1-9]{8}"));
}
输出如下:
yaml
0353-0231554Wzz6Om15BW35W2H43S
64335143