`
mushme
  • 浏览: 777635 次
  • 性别: Icon_minigender_1
  • 来自: 西安
社区版块
存档分类
最新评论

选择一个好的驾校,用数据说话,我用python

 
阅读更多
在交管局网站上,看到每月的考试信息都有工具。
想想,可以汇总下,按照考试人数排个名(群众的选择大部分时候都是有道理的)

1.数据先抓取下来
#encoding=utf8
import urllib
import urllib.request
import http.cookiejar
from bs4 import BeautifulSoup
# 导入SQLite驱动:
import sqlite3

User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
header = {}
header['User-Agent'] = User_Agent


vyear='201601'
urlFile = urllib.request.urlopen('https://sn.122.gov.cn/publicitypage?size=20&fzjg=%E9%99%95A&page=1&tjyf='+vyear)
data = urlFile.read()
urlFile.close()
data = data.decode('utf-8',errors='ignore')
print("get page success")
pre = "<div id='pagination'><span  class=''>共"
index1 = data.find(pre) + len(pre)
index2 = data.find('条记录', index1)

ebi=data[index1 : index2];
print("总条数:"+ebi)

totalPage="";
if (int(data[index1 : index2]))%20>0:
    totalPage=str(int(int(data[index1 : index2])/20+1)) 
else:
    totalPage=str(int(int(data[index1 : index2])/20))
print("totalpage:"+totalPage);


def getList(pageId):
    
    hosturl = 'https://sn.122.gov.cn/publicitypage?size=20&fzjg=%E9%99%95A&tjyf='+vyear+'&page='+pageId
    print("getcontent:"+hosturl);

    request = urllib.request.Request(hosturl, None, header)
    response = urllib.request.urlopen(request)
    htmldata = response.read()
    soup = BeautifulSoup(htmldata,"html.parser")
    trs = soup.findAll('tr')
    for x in range(1,len(trs)):
        tr = trs[x]
        tds = tr.findAll("td")
        savejiaxiao(tds[0].contents[0],tds[1].contents[0],tds[2].contents[0],tds[3].contents[0],tds[4].contents[0],tds[5].contents[0]);
        
def savejiaxiao(t_date, t_name,t1,t2,t3,t4):
    conn = sqlite3.connect('jiaxiao.db')
    cursor = conn.cursor()
    # 执行一条SQL语句,创建user表:
    cursor.execute('create table IF NOT EXISTS kaoshi (id INTEGER PRIMARY KEY, t_date varchar(100),t_name varchar(100),t1 varchar(100),t2 varchar(100),t3 varchar(100),t4 varchar(100) )')
    cursor.execute('select * from kaoshi where t_date=\''+t_date+'\' and t_name=\''+t_name+'\'  and t1=\''+t1+'\' and t2=\''+t2+'\' and t3=\''+t3+'\'  and t4=\''+t4+'\' ')
    values=cursor.fetchall()
    if len(values) > 0:#以前就存在
        print('已经存在:'+t_date+','+t_name)
    else:
        try:
            cursor.execute('insert into kaoshi (t_date, t_name,t1,t2,t3,t4) values(?,?,?,?,?,?)',(t_date,t_name,t1,t2,t3,t4)) 
            print("save success."+t_date+','+t_name)
        except Exception  as e:
            print('except:', e)
            
           
# 关闭Cursor:
    cursor.close()
# 提交事务:
    conn.commit()
# 关闭Connection:
    conn.close()        

for x in range(1,int(totalPage)+1): #代表从1到totalPage+1(不包含totalPage)
    errorLink=[]
    try:
        getList(str(x))
    except Exception  as e:
        print('except:', e)
        errorLink.append(x)
print("errorLink:"+str(errorLink));

2.把数据导出到excel里
# -*- coding:utf-8 -*-
import xlwt

# 导入SQLite驱动:
import sqlite3

	
def export():
    conn = sqlite3.connect('jiaxiao2016.db')
    cursor = conn.cursor()
    cursor.execute('select * from kaoshi')
    values = cursor.fetchall()
    i=0
    book=xlwt.Workbook()
    sheet1=book.add_sheet('驾校')
    for line in values:
        row = sheet1.row(i)
        row.write(0,line[0])#id
        row.write(1,line[1])#title
        row.write(2,line[2])#link
        row.write(3,line[3])
        row.write(4,line[4])
        row.write(5,line[5])
        row.write(6,line[6])
        sheet1.col(1).width = 5000
        i=i+1  
    book.save('驾校2016.xls')  
    cursor.close()
    conn.commit()
    conn.close()


export()


3.在excel里做个透视图,按月汇总并排序下,得到结果如下

西安西高驾校科二考场 1870 1241 1158 1095 5364
北所科目一考场 1593 1227 1088 863 4771
郊县分所蓝田县科三考试场 3887 210 343 256 4696
郊县长安科目一考场 2332 1844 15 282 4473
西安支队尚稷路科三考场 2263 764 834 567 4428
西安科目三咸阳吉祥考场 868 928 766 1488 4050
东所科目一考场 2664 300 230 159 3353
西安小型车科目三铜川考场 1424 574 557 588 3143
西安支队户县科目三考场 2107 186 343 338 2974
南所科目一考场 1397 473 511 433 2814
西安天平驾校科二考场 1457 456 890 2803
西安博安驾校科目二考场 842 313 935 417 2507
西安华津驾校科二考场 1494 120 422 291 2327
西所科目一考场 1560 262 193 208 2223
郊县阎良科目一考场 1708 226 1934
西安小型车科目三咸阳西郊考场 5 186 544 783 1518
郊县户县科目一考场 1188 247 21 33 1489
0
1
分享到:
评论
2 楼 mushme 2016-08-11  
jilong-liang 写道

public static void trustEveryone() {  

这个方法的作用是什么?
1 楼 jilong-liang 2016-07-05  
package sm;

import java.net.URL;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.X509TrustManager;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Test {
public static void main(String[] args) throws Exception{

trustEveryone();

for (int i = 1; i <=42; i++) {
String url="https://sn.122.gov.cn/publicitypage?size="+i+"&page=7&tjyf=201601&fzjg=%E9%99%95A&fwdmgl=6003";

Document doc=Jsoup.parse(new URL(url), 3000);

//处理从页面的class=table table-stirped table-hover样式下面的li标签 
            Elements trs=doc.getElementsByAttributeValue("class", "table table-stirped table-hover").select("tr");
            for(Element tr:trs){ 
            String rq = tr.select("td:eq(0)").text();//日期
            String kcmc = tr.select("td:eq(1)").text();//考场名称
            System.out.println(kcmc);
            }
}
}


public static void trustEveryone() {  
        try {   
            HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() {   
                public boolean verify(String hostname, SSLSession session) {   
                    return true;   
                }   
            });   
   
            SSLContext context = SSLContext.getInstance("TLS");   
            context.init(null, new X509TrustManager[] { new X509TrustManager() {   
                public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {   
                }   
   
                public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {   
                }   
   
                public X509Certificate[] getAcceptedIssuers() {   
                    return new X509Certificate[0];   
                }   
            } }, new SecureRandom());   
            HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());   
        } catch (Exception e) {   
            e.printStackTrace();   
        }   
    }   
}

相关推荐

Global site tag (gtag.js) - Google Analytics