java基础-IO流(网络爬虫/工具包生成假数据)

从网络上爬取姓和名字,组合成男生和女生的姓名,并写入到本地文件中

数据来源(网址):

- 百家姓: https://hanyu.baidu.com/shici/detail?from=aladdin&pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d&smp_names=termBrand2%2Cpoem1
- 男孩名字: http://www.haoming8.cn/baobao/10881.html
- 女孩名字: http://www.haoming8.cn/baobao/7641.html

复制代码
package Day13_IO;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Builds a text file of fake person records ({@code name-gender-age}, one per line) from
 * surnames and given names scraped from three web pages.
 */
public class IOTest01 {
    /** Page with the "Hundred Family Surnames" text. */
    private static final String FAMILY_NAME_URL =
            "https://hanyu.baidu.com/shici/detail?from=aladdin&pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d&smp_names=termBrand2%2Cpoem1";
    /** Page with boys' given names. */
    private static final String BOY_NAME_URL = "http://www.haoming8.cn/baobao/10881.html";
    /** Page with girls' given names. */
    private static final String GIRL_NAME_URL = "http://www.haoming8.cn/baobao/7641.html";
    /** Output file used when no path is passed on the command line. */
    private static final String DEFAULT_OUTPUT_FILE = "E:\\java\\javase\\basic\\javadown\\test5.txt";

    // Inside a character class '|' is a literal pipe, not alternation, so the delimiter
    // classes list only the punctuation that is actually meant.
    /** Four CJK characters (group 1) followed by a comma or an ideographic full stop. */
    private static final String FAMILY_NAME_REGEX = "([\\u4e00-\\u9fa5]{4})([,。])";
    /** Two CJK characters (group 1) followed by an enumeration comma or an ideographic full stop. */
    private static final String BOY_NAME_REGEX = "([\\u4e00-\\u9fa5]{2})([、。])";
    /** Five two-character tokens separated by single spaces (the whole match is used). */
    private static final String GIRL_NAME_REGEX = "(.. ){4}..";

    /** Finds the {@code charset=...} parameter of a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    private static final int MIN_AGE = 8;
    /** Boys get an age in [MIN_AGE, MIN_AGE + 10). */
    private static final int BOY_AGE_SPAN = 10;
    /** Girls get an age in [MIN_AGE, MIN_AGE + 8). */
    private static final int GIRL_AGE_SPAN = 8;

    /**
     * Downloads the three pages, extracts surnames and given names, generates 70 boy and 70 girl
     * records and writes them to a UTF-8 text file.
     *
     * @param args optional; {@code args[0]} overrides the default output file path
     * @throws IOException if a page cannot be read or the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String outputFile = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_FILE;

        // 1. Download every page as a single string.
        String familyNameStr = webCrawler(FAMILY_NAME_URL);
        String boyNameStr = webCrawler(BOY_NAME_URL);
        String girlNameStr = webCrawler(GIRL_NAME_URL);

        // 2. The raw page text is not usable directly; extract the interesting pieces by regex.
        ArrayList<String> familyNameNetList = gatData(familyNameStr, FAMILY_NAME_REGEX, 1);
        ArrayList<String> boyNameList = gatData(boyNameStr, BOY_NAME_REGEX, 1);
        ArrayList<String> girlNameStrList = gatData(girlNameStr, GIRL_NAME_REGEX, 0);

        // 3a. Surnames are matched in groups of four characters; every character is one surname.
        ArrayList<String> familyList = new ArrayList<>();
        for (String group : familyNameNetList) {
            for (int i = 0; i < group.length(); i++) {
                familyList.add(String.valueOf(group.charAt(i)));
            }
        }
        // 3b. Boys' names: drop duplicates while keeping first-seen order.
        ArrayList<String> boyList = new ArrayList<>(new LinkedHashSet<>(boyNameList));
        // 3c. Girls' names: every match holds several names separated by single spaces.
        ArrayList<String> girlList = new ArrayList<>();
        for (String line : girlNameStrList) {
            Collections.addAll(girlList, line.split(" "));
        }

        // 4. Build the "name-gender-age" records.
        ArrayList<String> personName = getName(familyList, boyList, girlList, 70, 70);

        // 5. Write one record per line. try-with-resources closes the file even when a write
        //    fails, and the charset is explicit so the output does not depend on the platform.
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputFile), StandardCharsets.UTF_8))) {
            for (String record : personName) {
                writer.write(record);
                writer.newLine();
            }
        }
    }

    /**
     * Generates {@code boyCount} distinct boy names and {@code girlCount} distinct girl names,
     * each a surname followed by a given name, and formats them as {@code name-gender-age}
     * records. Boys come first in the result. Ages are random: 8-17 for boys, 8-15 for girls.
     * The input lists are not modified.
     *
     * @param familyList surnames to choose from
     * @param boyList    boys' given names to choose from
     * @param girlList   girls' given names to choose from
     * @param boyCount   number of boy records to generate
     * @param girlCount  number of girl records to generate
     * @return the boy records followed by the girl records
     * @throws IllegalArgumentException if a count is negative or larger than the number of
     *                                  distinct full names the corresponding lists can form
     */
    public static ArrayList<String> getName(ArrayList<String> familyList,
                                            ArrayList<String> boyList,
                                            ArrayList<String> girlList,
                                            int boyCount,
                                            int girlCount) {
        Random random = new Random();
        List<String> boys = pickUniqueNames(familyList, boyList, boyCount, random);
        List<String> girls = pickUniqueNames(familyList, girlList, girlCount, random);

        ArrayList<String> list = new ArrayList<>(boys.size() + girls.size());
        for (String boy : boys) {
            int boyAge = random.nextInt(BOY_AGE_SPAN) + MIN_AGE;
            list.add(boy + "-男-" + boyAge);
        }
        for (String girl : girls) {
            int girlAge = random.nextInt(GIRL_AGE_SPAN) + MIN_AGE;
            list.add(girl + "-女-" + girlAge);
        }
        return list;
    }

    /**
     * Picks {@code count} distinct surname+given-name strings at random.
     *
     * <p>All distinct combinations are enumerated first, so a request that cannot be satisfied
     * is rejected instead of retrying forever. The work is proportional to
     * {@code familyNames.size() * givenNames.size()}, which is meant for small name lists.
     */
    private static List<String> pickUniqueNames(List<String> familyNames,
                                                List<String> givenNames,
                                                int count,
                                                Random random) {
        if (count < 0) {
            throw new IllegalArgumentException("count must not be negative: " + count);
        }
        if (count == 0) {
            return new ArrayList<>();
        }
        // A set, because different pairs can concatenate to the same string.
        Set<String> candidates = new LinkedHashSet<>();
        for (String family : familyNames) {
            for (String given : givenNames) {
                candidates.add(family + given);
            }
        }
        if (candidates.size() < count) {
            throw new IllegalArgumentException("requested " + count + " distinct names but only "
                    + candidates.size() + " can be formed");
        }
        List<String> pool = new ArrayList<>(candidates);
        Collections.shuffle(pool, random);
        return new ArrayList<>(pool.subList(0, count));
    }

    /**
     * Collects one capture group from every match of {@code regex} in the given text.
     *
     * @param familyNamestr text to search (any text, not only surnames)
     * @param regex         regular expression to apply
     * @param index         capture group to collect; 0 is the whole match
     * @return the selected group of every match, in order of appearance
     */
    public static ArrayList<String> gatData(String familyNamestr, String regex, int index) {
        ArrayList<String> list = new ArrayList<>();
        Matcher matcher = Pattern.compile(regex).matcher(familyNamestr);
        while (matcher.find()) {
            list.add(matcher.group(index));
        }
        return list;
    }

    /**
     * Downloads the resource at the given URL and returns its content as one string.
     *
     * <p>The text is decoded with the charset named in the Content-Type header, or UTF-8 when
     * the header names none. A charset declared only inside the document is not detected.
     *
     * @param net URL to read
     * @return the decoded content
     * @throws IOException if the URL is malformed or the resource cannot be read
     */
    public static String webCrawler(String net) throws IOException {
        URLConnection conn = new URL(net).openConnection();
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the stream even when reading fails half-way.
        try (InputStream in = conn.getInputStream();
             Reader reader = new BufferedReader(
                     new InputStreamReader(in, charsetOf(conn.getContentType())))) {
            char[] buffer = new char[8192];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                sb.append(buffer, 0, read);
            }
        }
        return sb.toString();
    }

    /** Returns the charset named in a Content-Type value, or UTF-8 if absent or unknown. */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher matcher = CHARSET_PARAM.matcher(contentType);
            if (matcher.find()) {
                try {
                    return Charset.forName(matcher.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Malformed or unsupported charset label: use the default below.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }
}

2、同样的操作,使用工具包hutool生成姓名假数据

复制代码
package Day13_IO;

import cn.hutool.Hutool;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.ReUtil;
import cn.hutool.http.HttpUtil;

import java.util.*;

/**
 * Same fake-name generator as the plain-JDK version, but using Hutool for the HTTP download,
 * the regex extraction and the file output.
 */
public class IOTest02 {
    /** Page with the "Hundred Family Surnames" text. */
    private static final String FAMILY_NAME_URL =
            "https://hanyu.baidu.com/shici/detail?from=aladdin&pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d&smp_names=termBrand2%2Cpoem1";
    /** Page with boys' given names. */
    private static final String BOY_NAME_URL = "http://www.haoming8.cn/baobao/10881.html";
    /** Page with girls' given names. */
    private static final String GIRL_NAME_URL = "http://www.haoming8.cn/baobao/7641.html";
    /** File the generated records are written to. */
    private static final String OUTPUT_FILE = "E:\\java\\javase\\basic\\javadown\\test5.txt";

    // Inside a character class '|' is a literal pipe, not alternation, so the delimiter
    // classes list only the punctuation that is actually meant.
    /** Four CJK characters (group 1) followed by a comma or an ideographic full stop. */
    private static final String FAMILY_NAME_REGEX = "([\\u4e00-\\u9fa5]{4})([,。])";
    /** Two CJK characters (group 1) followed by an enumeration comma or an ideographic full stop. */
    private static final String BOY_NAME_REGEX = "([\\u4e00-\\u9fa5]{2})([、。])";
    /** Five two-character tokens separated by single spaces (the whole match is used). */
    private static final String GIRL_NAME_REGEX = "(.. ){4}..";

    private static final int MIN_AGE = 8;
    /** Boys get an age in [MIN_AGE, MIN_AGE + 10). */
    private static final int BOY_AGE_SPAN = 10;
    /** Girls get an age in [MIN_AGE, MIN_AGE + 8). */
    private static final int GIRL_AGE_SPAN = 8;

    /**
     * Downloads the three pages, extracts surnames and given names, prints the raw matches,
     * generates 70 boy and 70 girl records and writes them to the output file as UTF-8.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // 1. Download every page as a single string.
        String familyNameNetstr = HttpUtil.get(FAMILY_NAME_URL);
        String boyNamestr = HttpUtil.get(BOY_NAME_URL);
        String girlNamestr = HttpUtil.get(GIRL_NAME_URL);

        // 2. Extract the interesting pieces by regex (last argument is the capture group).
        List<String> familyNameList = ReUtil.findAll(FAMILY_NAME_REGEX, familyNameNetstr, 1);
        List<String> boyNameList = ReUtil.findAll(BOY_NAME_REGEX, boyNamestr, 1);
        List<String> girlNameList = ReUtil.findAll(GIRL_NAME_REGEX, girlNamestr, 0);

        System.out.println(familyNameList);
        System.out.println(boyNameList);
        System.out.println(girlNameList);

        // 3a. Surnames are matched in groups of four characters; every character is one surname.
        ArrayList<String> familyList = new ArrayList<>();
        for (String group : familyNameList) {
            for (int i = 0; i < group.length(); i++) {
                familyList.add(String.valueOf(group.charAt(i)));
            }
        }
        // 3b. Boys' names: drop duplicates while keeping first-seen order.
        ArrayList<String> boyList = new ArrayList<>(new LinkedHashSet<>(boyNameList));
        // 3c. Girls' names: every match holds several names separated by single spaces.
        ArrayList<String> girlList = new ArrayList<>();
        for (String line : girlNameList) {
            Collections.addAll(girlList, line.split(" "));
        }

        // 4. Build the "name-gender-age" records.
        ArrayList<String> personName = getName(familyList, boyList, girlList, 70, 70);

        // 5. Write one record per line.
        FileUtil.writeLines(personName, OUTPUT_FILE, "UTF-8");
    }

    /**
     * Generates {@code boyCount} distinct boy names and {@code girlCount} distinct girl names,
     * each a surname followed by a given name, and formats them as {@code name-gender-age}
     * records. Boys come first in the result. Ages are random: 8-17 for boys, 8-15 for girls.
     * The input lists are not modified.
     *
     * @param familyList surnames to choose from
     * @param boyList    boys' given names to choose from
     * @param girlList   girls' given names to choose from
     * @param boyCount   number of boy records to generate
     * @param girlCount  number of girl records to generate
     * @return the boy records followed by the girl records
     * @throws IllegalArgumentException if a count is negative or larger than the number of
     *                                  distinct full names the corresponding lists can form
     */
    public static ArrayList<String> getName(ArrayList<String> familyList,
                                            ArrayList<String> boyList,
                                            ArrayList<String> girlList,
                                            int boyCount,
                                            int girlCount) {
        Random random = new Random();
        List<String> boys = pickUniqueNames(familyList, boyList, boyCount, random);
        List<String> girls = pickUniqueNames(familyList, girlList, girlCount, random);

        ArrayList<String> list = new ArrayList<>(boys.size() + girls.size());
        for (String boy : boys) {
            int boyAge = random.nextInt(BOY_AGE_SPAN) + MIN_AGE;
            list.add(boy + "-男-" + boyAge);
        }
        for (String girl : girls) {
            int girlAge = random.nextInt(GIRL_AGE_SPAN) + MIN_AGE;
            list.add(girl + "-女-" + girlAge);
        }
        return list;
    }

    /**
     * Picks {@code count} distinct surname+given-name strings at random.
     *
     * <p>All distinct combinations are enumerated first, so a request that cannot be satisfied
     * is rejected instead of retrying forever. The work is proportional to
     * {@code familyNames.size() * givenNames.size()}, which is meant for small name lists.
     */
    private static List<String> pickUniqueNames(List<String> familyNames,
                                                List<String> givenNames,
                                                int count,
                                                Random random) {
        if (count < 0) {
            throw new IllegalArgumentException("count must not be negative: " + count);
        }
        if (count == 0) {
            return new ArrayList<>();
        }
        // A set, because different pairs can concatenate to the same string.
        Set<String> candidates = new LinkedHashSet<>();
        for (String family : familyNames) {
            for (String given : givenNames) {
                candidates.add(family + given);
            }
        }
        if (candidates.size() < count) {
            throw new IllegalArgumentException("requested " + count + " distinct names but only "
                    + candidates.size() + " can be formed");
        }
        List<String> pool = new ArrayList<>(candidates);
        Collections.shuffle(pool, random);
        return new ArrayList<>(pool.subList(0, count));
    }
}
相关推荐
桦说编程2 小时前
滑动窗口限流器的演进之路:从调度器实现到 Packed CAS
java·后端·性能优化
开开心心_Every3 小时前
安卓后台录像APP:息屏录存片段,行车用
java·服务器·前端·学习·eclipse·edge·powerpoint
初次攀爬者3 小时前
SpringBoot 整合 JWT + Redis 实现登录鉴权
java·redis·后端
悦悦妍妍3 小时前
spring-ioc
java
佛系打工仔3 小时前
绘制K线第一章:可见区间处理
java
wangkay883 小时前
【Java 转运营】Day02:抖音直播间流量底层逻辑全解析
java·新媒体运营
5***b974 小时前
Spring Boot--@PathVariable、@RequestParam、@RequestBody
java·spring boot·后端
AIGCExplore4 小时前
Jenkins 全局配置及工具验证教程
java·servlet·jenkins