引擎切换&pdf识别&简历分析

文章目录

1.EasyCode生成interview_history的crud

1.在模板设置中手动指定逻辑删除的值
2.生成代码,进行测试

2.PDF识别关键字

1.引入依赖
xml 复制代码
        <!-- pdf解析器 -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.24</version>
        </dependency>
2.代码概览
3.PDFUtil.java
java 复制代码
package com.sunxiansheng.interview.server.util;

import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Pattern;

/**
 * Downloads a PDF over HTTP and extracts its text content.
 */
@Slf4j
public class PDFUtil {

    /**
     * Matches any run of whitespace (spaces, tabs, line breaks). Compiled once and reused.
     */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Connect and read timeout for the download, in milliseconds.
     */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Fetches the PDF at the given URL and returns its text with all whitespace removed.
     *
     * <p>Failures (malformed URL, non-HTTP URL, network error, unparsable PDF) are logged
     * and swallowed on purpose: callers get an empty string instead of an exception.
     *
     * @param pdfUrl HTTP(S) URL of the PDF to read
     * @return the whitespace-free text of the PDF, or {@code ""} if anything went wrong
     */
    public static String getPdfText(String pdfUrl) {
        String text = "";
        HttpURLConnection connection = null;
        try {
            URL url = new URL(pdfUrl);
            // The cast rejects non-HTTP protocols (e.g. file:) with a ClassCastException,
            // which is handled by the catch below like any other failure.
            connection = (HttpURLConnection) url.openConnection();
            // Plain GET: no request body is sent, so doOutput is deliberately left unset.
            connection.setRequestMethod("GET");
            connection.setUseCaches(false);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            // try-with-resources closes both the response stream and the document,
            // even when text extraction throws.
            try (InputStream in = connection.getInputStream();
                 PDDocument document = PDDocument.load(in)) {
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setSortByPosition(true);
                // Page numbers are 1-based; extract the whole document.
                stripper.setStartPage(1);
                stripper.setEndPage(Integer.MAX_VALUE);
                text = WHITESPACE.matcher(stripper.getText(document)).replaceAll("");
                // NOTE(review): this logs the full document text at INFO level — confirm
                // that is acceptable for resumes containing personal data.
                log.info("识别到的pdf为{}", text);
            }
        } catch (Exception e) {
            log.error("获取pdf转为文字错误:{}", e.getMessage(), e);
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return text;
    }

}
4.keyword
1.EndType.java
java 复制代码
package com.sunxiansheng.interview.server.util.keyword;

/**
 * Marks whether a node of the keyword dictionary tree ends a word.
 *
 * <p>The declaration order matters: {@code KeyWordUtil} stores the ordinal of these
 * constants (as a string) in each tree node, so constants must not be reordered.
 *
 * @author minghu.zhang
 * @date 11:37 2020/11/11
 **/
public enum EndType {

    /**
     * The node is not the last character of a word; more characters follow.
     */
    HAS_NEXT,

    /**
     * The node is the last character of a word.
     */
    IS_END
}
2.FlagIndex.java
java 复制代码
package com.sunxiansheng.interview.server.util.keyword;

import java.util.List;

/**
 * Result of one dictionary lookup: whether a word was matched, whether that word is
 * a whitelist word, and which character positions of the input took part in the match.
 *
 * @author minghu.zhang
 */
public class FlagIndex {

    /**
     * {@code true} when a complete word was matched.
     */
    private boolean flag;

    /**
     * {@code true} when the matched word is a whitelist word.
     */
    private boolean isWhiteWord;

    /**
     * Positions (indexes into the scanned text) of the matched characters.
     */
    private List<Integer> index;

    /**
     * @return {@code true} when a complete word was matched
     */
    public boolean isFlag() {
        return this.flag;
    }

    /**
     * @param flag whether a complete word was matched
     */
    public void setFlag(boolean flag) {
        this.flag = flag;
    }

    /**
     * @return {@code true} when the matched word is a whitelist word
     */
    public boolean isWhiteWord() {
        return this.isWhiteWord;
    }

    /**
     * @param whiteWord whether the matched word is a whitelist word
     */
    public void setWhiteWord(boolean whiteWord) {
        this.isWhiteWord = whiteWord;
    }

    /**
     * @return positions of the matched characters; {@code null} until set
     */
    public List<Integer> getIndex() {
        return this.index;
    }

    /**
     * @param index positions of the matched characters
     */
    public void setIndex(List<Integer> index) {
        this.index = index;
    }
}
3.WordType.java
java 复制代码
package com.sunxiansheng.interview.server.util.keyword;

/**
 * Kind of a dictionary word.
 *
 * <p>The declaration order matters: {@code KeyWordUtil} stores the ordinal of these
 * constants (as a string) in each tree node, so constants must not be reordered.
 *
 * @author minghu.zhang
 * @date 11:37 2020/11/11
 **/
public enum WordType {

    /**
     * Blacklist word.
     */
    BLACK,

    /**
     * Whitelist word.
     */
    WHITE
}
4.KeyWordUtil.java
java 复制代码
package com.sunxiansheng.interview.server.util.keyword;

import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;

import java.util.*;

/**
 * Keyword matcher backed by a dictionary tree (one nested map per character).
 *
 * <p>Words are registered with {@link #addWord(Collection)}; afterwards
 * {@link #buildKeyWordsLists(String)} reports every registered word found in a text.
 */
public class KeyWordUtil {

    /**
     * Node attribute: {@code "1"} when the path from the root to the node spells a complete word.
     */
    private static final String END_KEY = "isEnd";

    /**
     * Node attribute: {@code "1"} when the word ending at the node is a whitelist word.
     */
    private static final String WHITE_WORD_KEY = "isWhiteWord";

    /**
     * Attribute value meaning "yes" (the ordinal of {@code EndType.IS_END} / {@code WordType.WHITE}).
     */
    private static final String YES = "1";

    /**
     * Root of the dictionary tree. Every node maps a {@link Character} to its child node
     * and may additionally hold the string attributes {@code "isEnd"} and {@code "isWhiteWord"}.
     */
    private static final Map<Object, Object> wordMap = new HashMap<>(1024);

    /**
     * Set once {@link #addWord(Collection)} has completed at least once; volatile so that
     * readers observing {@code true} also observe the populated dictionary.
     */
    private static volatile boolean init = false;

    /**
     * @return {@code true} once {@link #addWord(Collection)} has completed at least once
     */
    public static boolean isInit() {
        return init;
    }

    /**
     * Finds the registered (non-whitelist) words contained in the text.
     *
     * <p>A lookup is started at every character position, so overlapping words and
     * repeated occurrences are all reported.
     *
     * @param text text to scan; {@code null} or empty yields an empty list
     * @return the matched words in order of appearance, never {@code null}
     */
    public static List<String> buildKeyWordsLists(final String text) {
        List<String> wordList = new ArrayList<>();
        if (text == null || text.isEmpty()) {
            return wordList;
        }
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            FlagIndex fi = getFlagIndex(charset, i, 0);
            if (!fi.isFlag()) {
                continue;
            }
            if (fi.isWhiteWord()) {
                // Skip over the whole whitelist word; the loop's i++ accounts for the "- 1".
                i += fi.getIndex().size() - 1;
                continue;
            }
            StringBuilder builder = new StringBuilder(fi.getIndex().size());
            for (int j : fi.getIndex()) {
                builder.append(charset[j]);
            }
            wordList.add(builder.toString());
        }
        return wordList;
    }

    /**
     * Walks the dictionary tree from {@code begin} and reports the longest word found there.
     *
     * @param charset text being scanned
     * @param begin   position at which the match must start
     * @param skip    maximum number of consecutive non-matching characters tolerated inside a word
     * @return the match result; its index list holds exactly the positions of the matched word
     */
    private static FlagIndex getFlagIndex(final char[] charset, final int begin, final int skip) {
        FlagIndex fi = new FlagIndex();
        Map<Object, Object> current = wordMap;
        boolean flag = false;
        int count = 0;
        // Number of entries of `index` that belong to the last complete word seen so far.
        int matchedLength = 0;
        List<Integer> index = new ArrayList<>();
        for (int i = begin; i < charset.length; i++) {
            Map<Object, Object> mapTree = child(current, charset[i]);
            if (count > skip || (i == begin && mapTree == null)) {
                break;
            }
            if (mapTree != null) {
                current = mapTree;
                count = 0;
                index.add(i);
            } else {
                count++;
                if (flag && count > skip) {
                    break;
                }
            }
            if (YES.equals(current.get(END_KEY))) {
                flag = true;
                matchedLength = index.size();
            }
            if (YES.equals(current.get(WHITE_WORD_KEY))) {
                fi.setWhiteWord(true);
                break;
            }
        }
        // The walk may have continued past the last complete word into the prefix of a
        // longer word that never finished (words "ja"/"java", text "javx"). Drop those
        // trailing positions so only the real word ("ja") is reported.
        if (flag && matchedLength < index.size()) {
            index = new ArrayList<>(index.subList(0, matchedLength));
        }
        fi.setFlag(flag);
        fi.setIndex(index);
        return fi;
    }

    /**
     * Registers words in the dictionary tree. Every word is stored as a blacklist word.
     *
     * <p>Synchronized because the tree is a shared static structure; the initialised flag
     * is raised only after the words are in place. A {@code null} or empty collection
     * still marks the dictionary as initialised.
     *
     * @param wordList words to register; {@code null} entries are ignored
     */
    public static synchronized void addWord(Collection<String> wordList) {
        if (!CollectionUtils.isEmpty(wordList)) {
            WordType wordType = WordType.BLACK;
            for (String key : wordList) {
                if (key == null) {
                    continue;
                }
                Map<Object, Object> nowMap = wordMap;
                for (int i = 0; i < key.length(); i++) {
                    char keyChar = key.charAt(i);
                    Map<Object, Object> next = child(nowMap, keyChar);
                    if (next == null) {
                        // New node: not the end of a word until proven otherwise below.
                        next = new HashMap<>(4);
                        next.put(END_KEY, String.valueOf(EndType.HAS_NEXT.ordinal()));
                        nowMap.put(keyChar, next);
                    }
                    nowMap = next;

                    if (i == key.length() - 1) {
                        // Last character: mark the node as the end of a word.
                        nowMap.put(END_KEY, String.valueOf(EndType.IS_END.ordinal()));
                        nowMap.put(WHITE_WORD_KEY, String.valueOf(wordType.ordinal()));
                    }
                }
            }
        }
        init = true;
    }

    /**
     * Returns the child node reached from {@code node} via {@code key}, or {@code null}.
     */
    @SuppressWarnings("unchecked")
    private static Map<Object, Object> child(Map<Object, Object> node, char key) {
        // Safe: Character keys are only ever mapped to nested node maps by addWord.
        return (Map<Object, Object>) node.get(key);
    }
}

3.策略模式实现引擎切换&简历分析

1.req和vo
1.InterviewReq.java
java 复制代码
package com.sunxiansheng.interview.api.req;

import com.sunxiansheng.interview.api.enums.EngineEnum;
import lombok.Getter;
import lombok.Setter;

import java.io.Serializable;


/**
 * Request body for a resume analysis.
 */
@Setter
@Getter
public class InterviewReq implements Serializable {

    /**
     * URL of the resume PDF.
     */
    private String url;

    /**
     * Name of the analysis engine to use (an {@link EngineEnum} constant name);
     * defaults to {@code JI_CHI}.
     */
    private String engine = EngineEnum.JI_CHI.name();

}
2.InterviewVO.java
java 复制代码
package com.sunxiansheng.interview.api.vo;

import lombok.Data;
import lombok.Getter;
import lombok.Setter;

import java.io.Serializable;
import java.util.List;


/**
 * Result of a resume analysis.
 */
@Setter
@Getter
public class InterviewVO implements Serializable {

    /**
     * The recognised interview topics.
     */
    private List<Interview> questionList;

    /**
     * A single recognised topic.
     */
    @Data
    public static class Interview {

        /**
         * Keyword recognised in the PDF, formatted as "category name-label name".
         */
        private String keyWord;

        /**
         * Id of the category the label belongs to.
         */
        private Long categoryId;

        /**
         * Id of the label.
         */
        private Long labelId;
    }

}
2.策略模式准备
1.引擎策略枚举 EngineEnum.java
java 复制代码
package com.sunxiansheng.interview.api.enums;

import lombok.Getter;

/**
 * Available resume-analysis engines.
 */
@Getter
public enum EngineEnum {

    /**
     * The local engine.
     */
    JI_CHI,

    /**
     * NOTE(review): presumably the AI-backed engine; no implementation is visible here — confirm.
     */
    ALI_BL,

    ;
}
2.引擎策略能力接口 InterviewEngine.java
java 复制代码
package com.sunxiansheng.interview.server.service;


import com.sunxiansheng.interview.api.enums.EngineEnum;
import com.sunxiansheng.interview.api.vo.InterviewVO;

import java.util.List;

/**
 * Strategy contract implemented by every resume-analysis engine.
 */
public interface InterviewEngine {

    /**
     * Identifies which engine this implementation is.
     *
     * @return the engine type
     */
    EngineEnum engineType();

    /**
     * Derives interview topics from the keywords found in a resume.
     *
     * @param KeyWords keywords extracted from the resume
     * @return the interview topics
     */
    InterviewVO analyse(List<String> KeyWords);

}
3.本地引擎具体策略 JiChiInterviewEngine.java
java 复制代码
package com.sunxiansheng.interview.server.service.impl;

import com.sunxiansheng.interview.api.enums.EngineEnum;
import com.sunxiansheng.interview.api.vo.InterviewVO;
import com.sunxiansheng.interview.server.entity.po.SubjectCategory;
import com.sunxiansheng.interview.server.entity.po.SubjectLabel;
import com.sunxiansheng.interview.server.mapper.SubjectMapper;
import com.sunxiansheng.interview.server.service.InterviewEngine;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;

import javax.annotation.PostConstruct;
import javax.annotation.Resource;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * Description: the local engine. It maps resume keywords onto the subject labels
 * loaded from the database.
 * @Author sun
 * @Create 2024/7/22 14:23
 * @Version 1.0
 */
@Component
public class JiChiInterviewEngine implements InterviewEngine {

    @Resource
    private SubjectMapper subjectMapper;

    /**
     * All labels, loaded once in {@link #init()}.
     */
    private List<SubjectLabel> labels;

    /**
     * All categories keyed by category id, loaded once in {@link #init()}.
     */
    private Map<Long, SubjectCategory> categoryMap;

    /**
     * Loads labels and categories after dependency injection.
     */
    @PostConstruct
    public void init() {
        labels = subjectMapper.listAllLabel();
        // key: category id, value: the category itself
        categoryMap = subjectMapper.listAllCategory().stream()
                .collect(Collectors.toMap(SubjectCategory::getId, Function.identity()));
    }

    /**
     * Identifies this implementation as the local engine.
     *
     * @return {@link EngineEnum#JI_CHI}
     */
    @Override
    public EngineEnum engineType() {
        return EngineEnum.JI_CHI;
    }

    /**
     * Returns one interview topic for every label whose name appears among the keywords.
     *
     * @param KeyWords keywords extracted from the resume; may be {@code null} or empty
     * @return the matching topics; the question list is empty (never {@code null}) when nothing matches
     */
    @Override
    public InterviewVO analyse(List<String> KeyWords) {
        InterviewVO interviewVO = new InterviewVO();
        if (CollectionUtils.isEmpty(KeyWords)) {
            // Return an empty list rather than null so callers need no null check.
            interviewVO.setQuestionList(new java.util.ArrayList<>());
            return interviewVO;
        }
        // A hash set makes each label lookup O(1); the keyword list may be long and contain duplicates.
        java.util.Set<String> keywordSet = new java.util.HashSet<>(KeyWords);
        List<InterviewVO.Interview> questions = labels.stream()
                .filter(label -> keywordSet.contains(label.getLabelName()))
                .map(this::toInterview)
                .collect(Collectors.toList());
        interviewVO.setQuestionList(questions);
        return interviewVO;
    }

    /**
     * Converts a matched label into an interview topic.
     */
    private InterviewVO.Interview toInterview(SubjectLabel label) {
        InterviewVO.Interview interview = new InterviewVO.Interview();
        SubjectCategory subjectCategory = categoryMap.get(label.getCategoryId());
        if (Objects.nonNull(subjectCategory)) {
            // Known category: keyword is "category name-label name".
            interview.setKeyWord(String.format("%s-%s", subjectCategory.getCategoryName(), label.getLabelName()));
        } else {
            // Unknown category: fall back to the bare label name.
            interview.setKeyWord(label.getLabelName());
        }
        interview.setCategoryId(label.getCategoryId());
        interview.setLabelId(label.getId());
        return interview;
    }
}
3.业务
1.InterviewController.java
java 复制代码
package com.sunxiansheng.interview.server.controller;

import com.alibaba.fastjson.JSON;
import com.google.common.base.Preconditions;
import com.sunxiansheng.interview.api.common.Result;
import com.sunxiansheng.interview.api.req.InterviewReq;
import com.sunxiansheng.interview.api.vo.InterviewVO;
import com.sunxiansheng.interview.server.convert.InterviewHistoryConvert;
import com.sunxiansheng.interview.server.entity.dto.InterviewHistoryDto;
import com.sunxiansheng.interview.server.entity.page.PageResult;
import com.sunxiansheng.interview.server.entity.req.InterviewHistoryReq;
import com.sunxiansheng.interview.server.entity.vo.InterviewHistoryVo;
import com.sunxiansheng.interview.server.service.InterviewHistoryService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.*;

import javax.annotation.Resource;
import java.util.Objects;

/**
 * REST controller for mock-interview data.
 *
 * @author sun
 * @since 2024-07-21 16:03:42
 */
@Slf4j
@RestController
@RequestMapping("/interview")
public class InterviewController {

    /**
     * Service handling interview history and resume analysis.
     */
    @Resource
    private InterviewHistoryService interviewHistoryService;

    /**
     * Queries interview history page by page.
     *
     * <p>The filter arrives as a JSON request body. Many HTTP clients and proxies do not
     * send a body with GET, so POST is accepted as well; GET is kept for existing callers.
     *
     * @param req filter conditions
     * @return the requested page, or a failure result carrying the error message
     */
    @RequestMapping(value = "/queryPage", method = {RequestMethod.GET, RequestMethod.POST})
    public Result<PageResult<InterviewHistoryVo>> queryByPage(@RequestBody InterviewHistoryReq req) {
        try {
            if (log.isInfoEnabled()) {
                log.info("分页查询数据入参{}", JSON.toJSONString(req));
            }

            // Convert the request to the service-layer dto.
            InterviewHistoryDto interviewHistoryDto = InterviewHistoryConvert.INSTANCE.convertReqToDto(req);
            PageResult<InterviewHistoryVo> interviewHistoryVoPageResult = this.interviewHistoryService.queryByPage(interviewHistoryDto);
            return Result.ok(interviewHistoryVoPageResult);
        } catch (Exception e) {
            log.error("分页查询数据!错误原因{}", e.getMessage(), e);
            return Result.fail(e.getMessage());
        }
    }

    /**
     * Analyses a resume PDF with the requested engine.
     *
     * @param req the PDF url and the engine name; both must be non-blank
     * @return the analysis result; a failure result with the validation message when the
     *         arguments are invalid, or with a generic message on any other error
     */
    @PostMapping(value = "/analyse")
    public Result<InterviewVO> analyse(@RequestBody InterviewReq req) {
        try {
            if (log.isInfoEnabled()) {
                log.info("分析简历入参{}", JSON.toJSONString(req));
            }
            Preconditions.checkArgument(!Objects.isNull(req), "参数不能为空!");
            // Blank values are rejected too: an empty url would otherwise fail silently
            // further down and produce an empty analysis.
            Preconditions.checkArgument(!isBlank(req.getEngine()), "引擎不能为空!");
            Preconditions.checkArgument(!isBlank(req.getUrl()), "简历不能为空!");
            return Result.ok(interviewHistoryService.analyse(req));
        } catch (IllegalArgumentException e) {
            log.error("参数异常!错误原因{}", e.getMessage(), e);
            return Result.fail(e.getMessage());
        } catch (Exception e) {
            log.error("分析简历异常!错误原因{}", e.getMessage(), e);
            return Result.fail("分析简历异常!");
        }
    }

    /**
     * Tells whether the value is {@code null}, empty or whitespace only.
     */
    private static boolean isBlank(String value) {
        return value == null || value.trim().isEmpty();
    }

}
2.InterviewHistoryService.java
java 复制代码
package com.sunxiansheng.interview.server.service;

import com.sunxiansheng.interview.api.req.InterviewReq;
import com.sunxiansheng.interview.api.vo.InterviewVO;
import com.sunxiansheng.interview.server.entity.dto.InterviewHistoryDto;
import com.sunxiansheng.interview.server.entity.page.PageResult;
import com.sunxiansheng.interview.server.entity.vo.InterviewHistoryVo;

/**
 * Service contract for the interview summary record table (InterviewHistory).
 *
 * @author sun
 * @since 2024-07-21 16:03:42
 */
public interface InterviewHistoryService {

    /**
     * Queries interview history page by page.
     *
     * @param Dto filter conditions
     * @return the requested page
     */
    PageResult<InterviewHistoryVo> queryByPage(InterviewHistoryDto Dto);

    /**
     * Analyses a resume with the engine named in the request.
     *
     * @param req the request describing the resume and the engine
     * @return the analysis result
     */
    InterviewVO analyse(InterviewReq req);
}
3.InterviewHistoryServiceImpl.java
java 复制代码
package com.sunxiansheng.interview.server.service.impl;

import com.google.common.base.Preconditions;
import com.sunxiansheng.interview.api.req.InterviewReq;
import com.sunxiansheng.interview.api.vo.InterviewVO;
import com.sunxiansheng.interview.server.convert.InterviewHistoryConvert;
import com.sunxiansheng.interview.server.entity.dto.InterviewHistoryDto;
import com.sunxiansheng.interview.server.entity.page.PageResult;
import com.sunxiansheng.interview.server.entity.page.SunPageHelper;
import com.sunxiansheng.interview.server.entity.po.InterviewHistoryPo;
import com.sunxiansheng.interview.server.entity.po.SubjectLabel;
import com.sunxiansheng.interview.server.entity.vo.InterviewHistoryVo;
import com.sunxiansheng.interview.server.mapper.InterviewHistoryMapper;
import com.sunxiansheng.interview.server.mapper.SubjectMapper;
import com.sunxiansheng.interview.server.service.InterviewEngine;
import com.sunxiansheng.interview.server.service.InterviewHistoryService;
import com.sunxiansheng.interview.server.util.PDFUtil;
import com.sunxiansheng.interview.server.util.keyword.KeyWordUtil;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Service;

import javax.annotation.Resource;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Service implementation for the interview summary record table (InterviewHistory).
 *
 * @author sun
 * @since 2024-07-21 16:27:12
 */
@Service("interviewHistoryService")
public class InterviewHistoryServiceImpl implements InterviewHistoryService, ApplicationContextAware {

    @Resource
    private InterviewHistoryMapper interviewHistoryMapper;

    @Resource
    private SubjectMapper subjectMapper;

    /**
     * All engine strategies, keyed by the name of their {@code EngineEnum} constant.
     */
    private static final Map<String, InterviewEngine> engineMap = new HashMap<>();

    /**
     * Receives the application context from Spring and registers every
     * {@link InterviewEngine} bean under its engine type name for later lookup.
     *
     * @param applicationContext the context this bean runs in
     * @throws BeansException if the engine beans cannot be obtained
     */
    @Override
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        Collection<InterviewEngine> engines = applicationContext.getBeansOfType(InterviewEngine.class).values();
        for (InterviewEngine engine : engines) {
            engineMap.put(engine.engineType().name(), engine);
        }
    }

    /**
     * Queries interview history page by page.
     *
     * @param interviewHistoryDto filter conditions, carrying pageNo and pageSize
     * @return the requested page
     */
    @Override
    public PageResult<InterviewHistoryVo> queryByPage(InterviewHistoryDto interviewHistoryDto) {
        // dto -> po
        InterviewHistoryPo interviewHistoryPo = InterviewHistoryConvert.INSTANCE.convertDtoToPo(interviewHistoryDto);
        // Paginate via SunPageHelper, supplying the count query and the page query.
        PageResult<InterviewHistoryPo> paginate = SunPageHelper.paginate(interviewHistoryDto.getPageNo(), interviewHistoryDto.getPageSize(),
                () -> interviewHistoryMapper.count(interviewHistoryPo),
                (offset, size) -> interviewHistoryMapper.queryPage(interviewHistoryPo, offset, size)
        );
        // po -> vo
        return InterviewHistoryConvert.INSTANCE.convertPageResult(paginate);
    }

    /**
     * Analyses a resume with the engine named in the request.
     *
     * @param req the resume url and the engine name
     * @return the analysis result
     * @throws IllegalArgumentException if no engine is registered under the requested name
     */
    @Override
    public InterviewVO analyse(InterviewReq req) {
        // Resolve the engine first so an unknown engine fails fast, before the PDF is
        // downloaded and parsed.
        InterviewEngine engine = engineMap.get(req.getEngine());
        Preconditions.checkArgument(!Objects.isNull(engine), "引擎不能为空!");
        List<String> keyWords = buildKeyWords(req.getUrl());
        return engine.analyse(keyWords);
    }

    /**
     * Extracts the text of the PDF and returns the label names found in it.
     *
     * @param url url of the resume PDF
     * @return the keywords found in the PDF text
     */
    private List<String> buildKeyWords(String url) {
        String pdfText = PDFUtil.getPdfText(url);
        // The keyword dictionary is shared static state that is filled lazily. Taking the
        // lock for the check makes concurrent first requests wait until the dictionary is
        // fully loaded instead of scanning a half-built one.
        synchronized (KeyWordUtil.class) {
            if (!KeyWordUtil.isInit()) {
                // Every label name in the database becomes a dictionary word.
                List<String> list = subjectMapper.listAllLabel().stream().map(SubjectLabel::getLabelName).collect(Collectors.toList());
                KeyWordUtil.addWord(list);
            }
        }
        // Match the PDF text against the dictionary.
        return KeyWordUtil.buildKeyWordsLists(pdfText);
    }

}
4.测试
相关推荐
给自己做减法3 分钟前
排序算法快速记忆
java·算法·排序算法
计算机学姐24 分钟前
基于微信小程序的食堂点餐预约管理系统
java·vue.js·spring boot·mysql·微信小程序·小程序·mybatis
骆晨学长28 分钟前
基于springboot学生健康管理系统的设计与实现
java·开发语言·spring boot·后端·spring
骆晨学长29 分钟前
基于Springboot的医疗健康助手开题报告
java·spring boot·后端
二十雨辰29 分钟前
[苍穹外卖]-09Spring Task定时任务
java·数据库·spring
我是小酒29 分钟前
掌握 Spring:从新手到高手的常见问题汇总
java·后端·spring·springboot
A懿轩A43 分钟前
MySQL SQL多表查询语句各种连接
java·开发语言·数据库·sql·mysql·mybatis
代码代码快快显灵1 小时前
XML标记语言
xml·java·数据库
DKPT1 小时前
数据结构之排序的基本概念
java·数据结构·笔记·学习·算法
MessiGo1 小时前
Python 入门教程(3)基础知识 | 3.3、标识符
java·开发语言·python