Java 抓取网站图片

2019-1-4


CatchWebImg.txt

//抓取网站图片
public class CatchWebImg{
    //网站域名
    static String domainUrl;
 
    public static void main(String[] args) throws Exception{
        String Content = new String();
        String nextUrl = null;
        domainUrl = "";
        //MyUrl类抓根据URL抓取网站源代码
        MyUrl listObj = new MyUrl("");
        //设置抓取返回编码格式
        listObj.setEncoding("utf-8");
        //目标Url正则
        listObj.setTargetRegex("");
        //下一页正则
        listObj.setNextList("");
 
        while(true){
            Content = listObj.getUrlContent();
            //MyUrl类返回正则匹配到的网址
            String[] urls = listObj.getTargetUrls(Content).substring(1).split(",");
            for(int i =   0; i < urls.length; i++){
                System.out.println(urls[i]);
                //访问匹配到的网址
                Thread td = new Thread(new SonFile(domainUrl + urls[i]));
                td.start();
                Thread.sleep(3000);
            }
            //正则获取翻页网址
            nextUrl = listObj.getNextListUrl(Content);
            if(nextUrl == null){
                break;
            }
            //此处根据正则进行填写
            nextUrl = domainUrl + nextUrl.replace("", "");
            listObj.setTargetUrl(nextUrl);
            Thread.sleep(8 * 1000);
        }
    }
}
 
class SonFile implements Runnable{
    private String tmpURL;
    //线程池,抓取某一链接下相关图片的线程
    private static ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(10);
    
    public SonFile(String url){
        this.tmpURL = url;
    }
     
    @Override
    public void run() {
        // TODO Auto-generated method stub
        MyUrl obj = new MyUrl(this.tmpURL);
        //返回网页编码
        obj.setEncoding("utf-8");
        //图片标题
        obj.setTitleRegex("");
        //图片URL正则
        obj.setTargetRegex("");
 
        String[] urls;
        try {
            String Content = obj.getUrlContent();
 
            String title = obj.getBodyTitle(Content);
            try {
                title = title.substring(  0, title.indexOf(" "));
                urls = obj.getTargetUrls(Content).substring(1).split(",");
                System.out.println(title);
                for(int i =   0; i < urls.length; i +=   2){
                    Thread.sleep(600);
                    //抓取图片线程
                    MyTask td = new MyTask(title, urls[i], i /   2);
                    executor.execute(td);
                }
            } catch (Exception e) {
                title = "Error";
            }
             
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
 
class MyTask implements Runnable{
    private String tmpURL;
    private String tmpDirName;
    private int tmpIndex;
     
    public MyTask(String dirName, String url, int index){
        this.tmpDirName = dirName;
        this.tmpURL = url;
        this.tmpIndex = index;
    }
 
    @Override
    public void run() {
        try {
            this.download();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    
    //保存图片到本地
    public void download(){  
        URL url;
         
        byte[] bs = new byte[4096];   
        int len;   
        String Path = "";
        File dirFile = new File(Path + this.tmpDirName);
        if(!dirFile.exists()){
            dirFile.mkdir();
        }
         
        try {
            url = new URL(this.tmpURL);
            URLConnection con = url.openConnection();
            con.setConnectTimeout(8 * 1000);
            InputStream is = con.getInputStream(); 
             
            OutputStream os = new FileOutputStream(dirFile.getPath().trim() + "\\" + this.tmpIndex + ".jpg");  
            while ((len = is.read(bs)) != -  1) {  
              os.write(bs,   0, len);  
            }  
             
            os.close();  
            is.close();  
        } catch (Exception e) {
            System.err.println("********* " + e.toString() + " *********");
        }    
    }
}

下载地址

百度网盘
密码: