利用Gecco爬取(蚂蚁短租网)数据

  • 代码运行效果:

  • 此案例抓取采用单线程
  • 此案例中抓取页面所用到的 domain 类数量较多,文中没有逐一粘贴
  • 下载地址:https://download.csdn.net/download/qq_18600061/10781994
  • 抓取的数据可以用来学习ElasticSearch
  • 此代码还可以优化(我也是初学)
  • POM依赖:
<dependency>
    <groupId>com.geccocrawler</groupId>
    <artifactId>gecco</artifactId>
    <version>1.1.0</version>
</dependency>
<dependency>
    <groupId>com.geccocrawler</groupId>
    <artifactId>gecco-htmlunit</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.38</version>
</dependency>
<dependency>
    <groupId>com.belerweb</groupId>
    <artifactId>pinyin4j</artifactId>
    <version>2.5.0</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.34</version>
</dependency>
  • 代码工程结构(如下图)

1.蚂蚁短租网:城市列表常量类:

package org.ssgroup.spider.constant;import java.util.LinkedHashMap;
import java.util.Map;/*** @date 2018-11-09*/
// Constants for the mayi.com crawler: the city names to crawl, grouped by
// initial letter, plus the location-type labels shown on list pages.
public class MaYiAllCityConstant {

    /**
     * Hand-off flag between the launcher and the list pipeline: the launcher
     * clears it before crawling a city group and the pipeline sets it again
     * once it considers a city finished.
     *
     * <p>Declared {@code volatile} because the writer (pipeline, presumably
     * running on a crawler thread) and the reader (the polling loop in
     * {@code Application.init()}) are not otherwise synchronized; without it
     * the polling loop is not guaranteed to ever observe the update.
     */
    public static volatile boolean on = true;

    /** All mayi.com cities, keyed by group name, in insertion order. */
    public static Map<String,String[]> ALL_CITY = new LinkedHashMap<String,String[]>();

    /**
     * Location-type labels. Several entries carry a trailing space; they are
     * kept byte-for-byte because callers store and convert these values.
     */
    public static String[] CITY_LOCATION = new String[] {
        "商圈","景点 ","行政区 ","车站机场 ","地铁","学校","医院"
    };

    static {
        // Popular domestic cities (duplicates of entries in the letter groups).
        String[] holdCity = new String[] {
            "北京","上海","青岛","香港","成都","杭州","台北","三亚","大连","广州",
            "西安","重庆","厦门","秦皇岛","屏东","花莲","深圳","南京","苏州","烟台",
            "威海","天津","北海","武汉","葫芦岛","长沙","哈尔滨","沈阳","昆明","营口"
        };
        ALL_CITY.put("holdCity", holdCity);

        // A, B, C, D
        String[] abcd = new String[] {
            "鞍山","安阳","阿拉善","澳门","安顺","安庆","安康","阿勒泰",
            "北京","北戴河","北海","保定","本溪","包头","白山","宝鸡","蚌埠","博鳌","保山","百色","巴音郭楞","博尔塔拉","保亭","白沙",
            "成都","重庆","长沙","长春","承德","常州","长白山","赤峰","沧州","郴州","长治","潮州","常德","滁州","池州","昌吉","昌江",
            "大连","东戴河","丹东","大理","东莞","大同","德州","德阳","敦煌","东营","迪庆","都江堰","东方","德宏","定西","儋州","大兴安岭"
        };
        ALL_CITY.put("ABCD", abcd);

        // E, F, G, H
        String[] efgh = new String[] {
            "洱海","峨眉山","恩施","鄂尔多斯","鄂州",
            "福州","佛山","防城港","凤凰","抚顺","阜阳","抚州","阜新",
            "广州","贵阳","高雄","桂林","赣州","甘孜","广元","贵港","鼓浪屿","甘南",
            "杭州","花莲","葫芦岛","哈尔滨","合肥","海口","惠州","湖州","莫干山","呼和浩特","黄山","呼伦贝尔","横店","邯郸","衡水","淮安","衡阳","黑河","汉中","菏泽","红河","河源","黄龙","海西","海北","贺州","淮北","怀化","河池","黄石","海拉尔","淮南","海东","鹤岗","和田"
        };
        ALL_CITY.put("EFGH", efgh);

        // J, K, L, M
        String[] jklm = new String[] {
            "济南","基隆","嘉义","金门","嘉兴","锦州","吉林","济宁","九江","江门","焦作","景德镇","佳木斯","荆州","嘉峪关","九寨沟","晋城","吉安","揭阳","鸡西","济源",
            "昆明","开封","克拉玛依","库尔勒",
            "丽江","临高","兰州","洛阳","临沂","乐山","廊坊","连云港","柳州","庐山","凉山","聊城","拉萨","临汾","丽水","六盘水","泸州","龙岩","吕梁","陵水","辽源","陇南","林芝","临夏","乐东","临沧",
            "苗栗","马祖","绵阳","茂名","梅州","眉山","马鞍山"
        };
        ALL_CITY.put("JKLM", jklm);

        // N, P, Q, R
        String[] npqr = new String[] {
            "南戴河","南京","南投","宁波","南宁","南昌","南通","南阳","南充","南平","宁德","内江",
            "屏东","澎湖","普陀山","盘锦","平遥","莆田","平顶山","蓬莱","攀枝花","普洱","萍乡","平凉",
            "青岛","千岛湖","秦皇岛","泉州","清远","黔东南","齐齐哈尔","衢州","黔南","黔西南","曲靖","钦州",
            "日照","日喀则"
        };
        ALL_CITY.put("NPQR", npqr);

        // S, T, W
        String[] stw = new String[] {
            "上海","上海迪士尼","三亚","深圳","苏州","沈阳","石家庄","绍兴","汕头","上饶","韶关","四平","三明","松原","十堰","神农架","遂宁","石河子","绥化","商洛","随州","三沙",
            "台北","天涯海角","同里","天津","台东","台南","台中","桃园","太原","唐山","泰安","泰山","台州","通化","泰州","天水","通辽","吐鲁番","塔城","屯昌",
            "威海","武汉","温州","无锡","乌鲁木齐","潍坊","武夷山","芜湖","文昌","梧州","渭南","文山","万宁","武威","五指山"
        };
        ALL_CITY.put("STW", stw);

        // X, Y, Z
        String[] xyz = new String[] {
            "香港","西湖","西安","厦门","新北","新竹","西塘","西宁","徐州","西双版纳","新乡","雪乡","咸阳","邢台","湘西","湘潭","信阳","锡林郭勒","许昌","忻州","宣城","襄樊","兴安","宿迁","咸宁","宿州","孝感",
            "烟台","营口","云林","宜兰","扬州","银川","延边","阳江","宜昌","盐城","宜宾","延安","运城","玉溪","伊春","伊犁","雅安","宜春","岳阳","玉林","榆林","益阳","阳朔","洋浦",
            "周庄","郑州","珠海","彰化","舟山","中山","张家口","张家界","漳州","湛江","淄博","遵义","枣庄","镇江","株洲","肇庆","自贡","张掖","中卫","周口","驻马店","昭通","资阳"
        };
        ALL_CITY.put("XYZ", xyz);
    }
}

2.蚂蚁短租列表页抓取页面代码:

package org.ssgroup.spider.htmlBean;import java.util.List;import org.ssgroup.spider.htmlBean.domain.MaYiRoom;
import org.ssgroup.spider.htmlBean.domain.Page;
import org.ssgroup.spider.htmlBean.domain.list.CarOrAirport;
import org.ssgroup.spider.htmlBean.domain.list.Hospital;
import org.ssgroup.spider.htmlBean.domain.list.OfficeAreas;
import org.ssgroup.spider.htmlBean.domain.list.Offices;
import org.ssgroup.spider.htmlBean.domain.list.ScenicArea;
import org.ssgroup.spider.htmlBean.domain.list.School;
import org.ssgroup.spider.htmlBean.domain.list.ShopLoops;
import org.ssgroup.spider.htmlBean.domain.list.SubWayLine;
import org.ssgroup.spider.htmlBean.domain.list.SubWayStation;import com.geccocrawler.gecco.annotation.Gecco;
import com.geccocrawler.gecco.annotation.HtmlField;
import com.geccocrawler.gecco.annotation.Request;
import com.geccocrawler.gecco.annotation.RequestParameter;
import com.geccocrawler.gecco.annotation.Text;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.HtmlBean;/*** 蚂蚁短租列表页* @author HX-011* @date 2018-11-09*/
@Gecco(matchUrl="http://www.mayi.com/{city}/{code}", pipelines="maYiListPipeline")
public class MaYiListHtmlBean implements HtmlBean{private static final long serialVersionUID = -5332646457923675928L;@Requestprivate HttpRequest request;/*** 城市参数*/@RequestParameter("city")private String city;/*** 请求分页参数*/@RequestParameter("code")private String code;/*** 页面分页参数*/@Text@HtmlField(cssPath="#page > a.pg-active")private String page;/*** 分页总数*/@HtmlField(cssPath="#page > input[type=hidden]")private List<Page> pages;/*** 获取所有房源数据*/@HtmlField(cssPath="#searchRoom > dd")private List<MaYiRoom> room;/*** 位置类型*/
//  @Text
//  @HtmlField(cssPath="#position > div.rt-word.position_choose > div.item.next > div > a")
//  private List<String> locations;/*** 商圈:type=1*/@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-business.pr60 > div > span")private List<ShopLoops> shopLoops;/*** 景点:type=2*/@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.pr60.result-scenic > div > a")private List<ScenicArea> scenicAreas;/*** 行政区:type=3*/@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-area > a")private List<OfficeAreas> officeAreas;/*** 行政区:子区域*/@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-area > div")private List<Offices> offices;/*** 车站机场:type=4*/@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.pr60.result-traffic > div > span")private List<CarOrAirport> carOrAirport;/*** 地铁:type=5*/@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-metro > a")private List<SubWayLine> subWayLine;/*** 地铁站详细:*/@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-metro > div")private List<SubWayStation> subWayStation;/*** 学校:type=6*/@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.pr60.result-school > div > a")private List<School> school;/*** 医院:type=7*/@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.pr60.result-hospital > div > a")private List<Hospital> hospital;public HttpRequest getRequest() {return request;}public void setRequest(HttpRequest request) {this.request = request;}public String getCity() {return city;}public void setCity(String city) {this.city = city;}public String getPage() {return page;}public void setPage(String page) {this.page = page;}public void setCode(String code) {this.code = code;}public String getCode() {return code;}public void setRoom(List<MaYiRoom> room) {this.room = room;}public List<MaYiRoom> getRoom() {return room;}public void setPages(List<Page> pages) {this.pages = pages;}public List<Page> getPages() {return pages;}public List<ShopLoops> 
getShopLoops() {return shopLoops;}public void setShopLoops(List<ShopLoops> shopLoops) {this.shopLoops = shopLoops;}public List<ScenicArea> getScenicAreas() {return scenicAreas;}public void setScenicAreas(List<ScenicArea> scenicAreas) {this.scenicAreas = scenicAreas;}public List<OfficeAreas> getOfficeAreas() {return officeAreas;}public void setOfficeAreas(List<OfficeAreas> officeAreas) {this.officeAreas = officeAreas;}public void setOffices(List<Offices> offices) {this.offices = offices;}public List<Offices> getOffices() {return offices;}public List<CarOrAirport> getCarOrAirport() {return carOrAirport;}public void setCarOrAirport(List<CarOrAirport> carOrAirport) {this.carOrAirport = carOrAirport;}public void setSubWayLine(List<SubWayLine> subWayLine) {this.subWayLine = subWayLine;}public List<SubWayLine> getSubWayLine() {return subWayLine;}public void setSubWayStation(List<SubWayStation> subWayStation) {this.subWayStation = subWayStation;}public List<SubWayStation> getSubWayStation() {return subWayStation;}public List<School> getSchool() {return school;}public void setSchool(List<School> school) {this.school = school;}public List<Hospital> getHospital() {return hospital;}public void setHospital(List<Hospital> hospital) {this.hospital = hospital;}
//  public void setLocations(List<String> locations) {
//      this.locations = locations;
//  }
//  public List<String> getLocations() {
//      return locations;
//  }
}

3.蚂蚁短租列表页的分页抓取与数据入库代码:

package org.ssgroup.spider.service;import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;import org.apache.commons.lang3.StringUtils;
import org.ssgroup.spider.Application;
import org.ssgroup.spider.constant.MaYiAllCityConstant;
import org.ssgroup.spider.htmlBean.MaYiListHtmlBean;
import org.ssgroup.spider.htmlBean.domain.MaYiRoom;
import org.ssgroup.spider.htmlBean.domain.list.CarOrAirport;
import org.ssgroup.spider.htmlBean.domain.list.Hospital;
import org.ssgroup.spider.htmlBean.domain.list.Office;
import org.ssgroup.spider.htmlBean.domain.list.OfficeAreas;
import org.ssgroup.spider.htmlBean.domain.list.Offices;
import org.ssgroup.spider.htmlBean.domain.list.ScenicArea;
import org.ssgroup.spider.htmlBean.domain.list.School;
import org.ssgroup.spider.htmlBean.domain.list.ShopLoops;
import org.ssgroup.spider.htmlBean.domain.list.Station;
import org.ssgroup.spider.htmlBean.domain.list.SubWayLine;
import org.ssgroup.spider.htmlBean.domain.list.SubWayStation;
import org.ssgroup.spider.htmlBean.domain.list.Vehicle;
import org.ssgroup.spider.utils.JdbcUtils;
import org.ssgroup.spider.utils.PinYinUtils;import com.geccocrawler.gecco.annotation.PipelineName;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.SchedulerContext;@PipelineName("maYiListPipeline")
public class MaYiListPipeline implements Pipeline<MaYiListHtmlBean>{public void process(MaYiListHtmlBean bean) {try {List<MaYiRoom> rooms = bean.getRoom();if(null!=rooms && rooms.size()>0) {String cityPinYin = bean.getCity();//保存位置类型saveLocation(bean, cityPinYin);//保存房源saveRooms(rooms,cityPinYin);//继续抓取分页数据String pageStr = StringUtils.isNotBlank(bean.getPage())?bean.getPage():"1";int page = Integer.parseInt(pageStr) + 1;int pageCount = bean.getPages().get(0).getPage();
System.out.println("城市拼音【"+bean.getCity()+"】,第【"+pageStr+"】次分页抓取,总分页数【"+pageCount+"】,每一次抓取数量【"+bean.getRoom().size()+"】");if(page>0 && page<=pageCount) {HttpRequest request = bean.getRequest();String nextUrl = request.getUrl();if(StringUtils.isNoneBlank(nextUrl)) {String baseUrl = StringUtils.substringBeforeLast(nextUrl, "/");nextUrl = baseUrl+"/"+page;
System.out.println("分页请求地址【"+nextUrl+"】");SchedulerContext.into(request.subRequest(nextUrl));}}//如果最后一页抓取完成,执行下一个城市if(page==pageCount) {MaYiAllCityConstant.on=true;}}} catch (Exception e) {e.printStackTrace();}}/*** 保存房源数据* @param rooms* @param city* @throws SQLException*/private void saveRooms(List<MaYiRoom> rooms,String city) throws SQLException{Connection conn = null;PreparedStatement pstmt = null;try {conn = JdbcUtils.getConnection();conn.setAutoCommit(false);String sql = "INSERT INTO rooms(id,price,house_location,original_url,image_url,title,num_room,num_house,city,city_id) " + "VALUES(?,?,?,?,?,?,?,?,?,?)";pstmt = conn.prepareStatement(sql);for(MaYiRoom room : rooms) {pstmt.setLong(1, room.getRoomId());pstmt.setFloat(2, room.getPrice());pstmt.setString(3, room.getPosition());pstmt.setString(4, room.getOriginalImageUrl());pstmt.setString(5, room.getImageUrl());pstmt.setString(6, room.getTitle());pstmt.setString(7, room.getRooms());pstmt.setString(8, room.getHousing());pstmt.setString(9, city);pstmt.setLong(10, Application.CITY_CACHE.get(city).getId());pstmt.addBatch();}pstmt.executeBatch();conn.commit();}catch (Exception e) {e.printStackTrace();conn.rollback();}finally {JdbcUtils.close(conn, pstmt, null);}}private void saveLocation(MaYiListHtmlBean bean,String city) throws Exception{Connection conn = null;PreparedStatement pstmt = null;ResultSet resultSet = null;String sql = "INSERT INTO city_location(name,pin_yin,city_id,city_name,city_pin_yin,parent_id,href) " + "VALUES(?,?,?,?,?,?,?)";try {conn = JdbcUtils.getConnection();//"商圈","景点 ","行政区 ","车站机场 ","地铁"," 学校"," 医院"String[] cityLocation = MaYiAllCityConstant.CITY_LOCATION;for(int i=1;i<=cityLocation.length;i++) {String location = cityLocation[i-1];conn.setAutoCommit(false);pstmt = conn.prepareStatement(sql,Statement.RETURN_GENERATED_KEYS);//城市IDString tmpLocaltion = PinYinUtils.convertLower(location);Long city_id = Application.CITY_LOCATION_CACHE.get(tmpLocaltion).getId();switch (i) {case 1:  
//商圈List<ShopLoops> shopLoop = bean.getShopLoops();if(null!=shopLoop && shopLoop.size()>0) {for(ShopLoops sl : shopLoop) {String name = sl.getLocation();addBatch(pstmt, name, city, city_id, sl.getHref());}}break;case 2: //景点List<ScenicArea> scenicAreas = bean.getScenicAreas();if(null!=scenicAreas && scenicAreas.size()>0) {for(ScenicArea scenicArea : scenicAreas) {addBatch(pstmt, scenicArea.getScenic(), city, city_id, scenicArea.getHref());}}break;case 3:   //行政区List<OfficeAreas> officeAreas = bean.getOfficeAreas();if(null!=officeAreas && officeAreas.size()>0) {for(OfficeAreas officeArea : officeAreas) {addBatch(pstmt, officeArea.getOfficeAreas(), city, city_id, null);}}break;case 4:   //车站机场 List<CarOrAirport> carOrAirport = bean.getCarOrAirport();if(null!=carOrAirport && carOrAirport.size()>0) {for(CarOrAirport ca : carOrAirport) {addBatch(pstmt, ca.getName(), city, city_id, null);}}break;case 5: //地铁List<SubWayLine> subWayLine = bean.getSubWayLine();if(null!=subWayLine && subWayLine.size()>0) {for(SubWayLine swl : subWayLine) {addBatch(pstmt, swl.getName(), city, city_id, null);}}break;case 6:    //学校List<School> school = bean.getSchool();if(null!=school && school.size()>0) {for(School s : school) {addBatch(pstmt, s.getName(), city, city_id, s.getHref());}}break;case 7: //医院List<Hospital> hospital = bean.getHospital();if(null!=hospital && hospital.size()>0) {for(Hospital h : hospital) {addBatch(pstmt, h.getName(), city, city_id, h.getHref());}}break;}pstmt.executeBatch();conn.commit();//添加子节点if(i==3||i==4||i==5) {//获取结果  自增IDResultSet rs = pstmt.getGeneratedKeys(); List<Long> list = new ArrayList<Long>();   while(rs.next()) {  list.add(rs.getLong(1));//取得ID  }conn.setAutoCommit(false);pstmt = conn.prepareStatement(sql,Statement.RETURN_GENERATED_KEYS);switch (i) {case 3:List<Offices> offices = bean.getOffices();if(null!=offices && offices.size()>0) {for(int j=0;j<list.size();j++) {Offices os = offices.get(0);for(Office office : os.getOffice()) {addBatch(pstmt, 
office.getOffice(), city, list.get(j), office.getHref());}}}break;case 4:List<CarOrAirport> cas = bean.getCarOrAirport();if(null!=cas && cas.size()>0) {for(int j=0;j<list.size();j++) {List<Vehicle> vehicles = cas.get(j).getVehicles();if(null!=vehicles && vehicles.size()>0) {for(Vehicle vehicle : vehicles) {addBatch(pstmt, vehicle.getVehicle(), city, list.get(j), vehicle.getHref());}}}}break;case 5:List<SubWayStation> subWayStation = bean.getSubWayStation();if(null!=subWayStation && subWayStation.size()>0) {for(int j=0;j<list.size();j++) {List<Station> stations = subWayStation.get(j).getStation();for(Station station : stations) {addBatch(pstmt, station.getName(), city, list.get(j), station.getHref());}}}break;}pstmt.executeBatch();conn.commit();}}}catch (Exception e) {e.printStackTrace();conn.rollback();}finally {JdbcUtils.close(conn, pstmt, resultSet);}}private void addBatch(PreparedStatement pstmt,String name,String city,Long city_id,String href) throws SQLException {pstmt.setString(1, name);pstmt.setString(2, PinYinUtils.convertLower(name));pstmt.setLong(3, Application.CITY_CACHE.get(city).getId());pstmt.setString(4, Application.CITY_CACHE.get(city).getName());//pstmt.setString(5, PinYinUtils.convertLower(Application.CITY_CACHE.get(city).getName()));pstmt.setString(5, city);pstmt.setLong(6, city_id);pstmt.setString(7, href);pstmt.addBatch();}
}

4.数据库连接工具类(使用 JDBC):

package org.ssgroup.spider.utils;import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

/**
 * Minimal JDBC helper: loads the MySQL driver once and hands out connections.
 *
 * <p>NOTE(review): URL and credentials are hard-coded; consider moving them to
 * external configuration.
 */
public class JdbcUtils {

    private static final String USERNAME = "root";
    private static final String PASSWORD = "root";
    private static final String DRIVER = "com.mysql.jdbc.Driver";
    private static final String URL = "jdbc:mysql://192.168.8.110:3306/test";

    static {
        try {
            // Only loads the driver class; no connection is opened here.
            Class.forName(DRIVER);
            System.out.println("数据库连接成功!");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens a new connection to the configured database.
     *
     * @return a new connection; the caller is responsible for closing it
     * @throws SQLException if the connection cannot be established
     */
    public static Connection getConnection() throws SQLException {
        return DriverManager.getConnection(URL, USERNAME, PASSWORD);
    }

    /**
     * Closes the given resources in the order result set, statement,
     * connection. Each argument may be {@code null}.
     *
     * <p>Every non-null resource is closed even when closing an earlier one
     * fails, so a failing result set can no longer leak the connection.
     *
     * @throws SQLException if closing any resource fails; when several fail,
     *                      the exception of the last failing close is thrown
     */
    public static void close(Connection connection, PreparedStatement pstmt, ResultSet resultSet) throws SQLException {
        try {
            if (null != resultSet) {
                resultSet.close();
            }
        } finally {
            try {
                if (null != pstmt) {
                    pstmt.close();
                }
            } finally {
                if (null != connection) {
                    connection.close();
                }
            }
        }
    }
}

5.拼音转换工具类:

package org.ssgroup.spider.utils;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.commons.lang3.StringUtils;/*** 针对中文转化拼音处理*/
// Helpers that convert Chinese text to pinyin using pinyin4j.
// For characters with several readings, the first reading returned by
// pinyin4j is used.
public class PinYinUtils {

    /**
     * Converts Chinese characters to upper-case pinyin without tones,
     * e.g. 中国 becomes ZHONGGUO. Non-Chinese characters are copied unchanged.
     *
     * @param text text to convert
     * @return the converted text, or "" for blank input
     */
    public static String convertUpper(String text) {
        return convert(text, HanyuPinyinCaseType.UPPERCASE, false);
    }

    /**
     * Converts Chinese characters to lower-case pinyin without tones,
     * e.g. 中国 becomes zhongguo. Non-Chinese characters are copied unchanged.
     *
     * @param text text to convert
     * @return the converted text, or "" for blank input
     */
    public static String convertLower(String text) {
        return convert(text, HanyuPinyinCaseType.LOWERCASE, false);
    }

    /**
     * Converts to pinyin with the first letter of every syllable capitalized,
     * e.g. 中国 becomes ZhongGuo. Non-Chinese characters are capitalized one
     * by one.
     *
     * @param text text to convert
     * @return the converted text, or "" for blank input
     */
    public static String converCapitalize(String text) {
        return convert(text, null, true);
    }

    /**
     * Returns the capitalized conversion with every lower-case ASCII letter
     * removed, i.e. the initials of all syllables, e.g. 中国 becomes ZG.
     *
     * @param text text to convert
     * @return the initials, or "" for blank input
     */
    public static String capitalizeLetter(String text) {
        String c = converCapitalize(text);
        if (StringUtils.isBlank(c)) {
            return "";
        }
        return StringUtils.replacePattern(c, "[a-z]", "");
    }

    /**
     * Returns the first character of the capitalized conversion,
     * e.g. 中国 becomes Z.
     *
     * @param text text to convert
     * @return the first letter, or "" for blank input
     */
    public static String firstLetter(String text) {
        String c = converCapitalize(text);
        if (StringUtils.isBlank(c)) {
            return "";
        }
        return StringUtils.substring(c, 0, 1);
    }

    /**
     * Converts the trimmed text to pinyin character by character.
     *
     * <p>Characters in the range U+4E00..U+9FA5 are replaced by their toneless
     * pinyin; all other characters are copied as they are. A character in that
     * range for which pinyin4j returns no reading is also copied unchanged
     * instead of failing with a NullPointerException.
     *
     * @param text         text to convert
     * @param caseType     letter case of the pinyin; when non-null it overrides
     *                     {@code isCapitalize}
     * @param isCapitalize whether to capitalize each converted piece; only
     *                     honoured when {@code caseType} is {@code null}
     * @return the converted text, or "" for blank input
     */
    public static String convert(String text, HanyuPinyinCaseType caseType, boolean isCapitalize) {
        if (StringUtils.isBlank(text)) {
            return "";
        }

        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        if (caseType != null) {
            format.setCaseType(caseType);
            isCapitalize = false;
        }
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);

        char[] input = StringUtils.trimToEmpty(text).toCharArray();
        StringBuilder builder = new StringBuilder();
        try {
            for (char c : input) {
                String piece = Character.toString(c);
                // Same range the former per-character regex matched.
                if (c >= '\u4E00' && c <= '\u9FA5') {
                    String[] temp = PinyinHelper.toHanyuPinyinStringArray(c, format);
                    if (temp != null && temp.length > 0) {
                        piece = temp[0];
                    }
                }
                builder.append(isCapitalize ? StringUtils.capitalize(piece) : piece);
            }
        } catch (BadHanyuPinyinOutputFormatCombination ex) {
            // Best effort: keep what was converted so far.
            ex.printStackTrace();
        }
        return builder.toString();
    }
}

6.启动类:

package org.ssgroup.spider;import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;import org.ssgroup.spider.constant.MaYiAllCityConstant;
import org.ssgroup.spider.domain.City;
import org.ssgroup.spider.domain.CityLocation;
import org.ssgroup.spider.utils.JdbcUtils;
import org.ssgroup.spider.utils.PinYinUtils;import com.geccocrawler.gecco.GeccoEngine;
import com.geccocrawler.gecco.request.HttpGetRequest;public class Application {public static Map<String,City> CITY_CACHE = new HashMap<String,City>();public static Map<String,CityLocation> CITY_LOCATION_CACHE = new HashMap<String,CityLocation>();public static String URL = "http://www.mayi.com";public static void main(String[] args) throws Exception {//初始化城市数据//initCity();//加载城市loadCityToMap();//初始化城市位置类型//initCityLocation();//加载城市位置类型loadCityLocationToMap();//先获取分类列表init();}private static void init() {String[] cityNames = new String[] {"ABCD","EFGH","JKLM","NPQR","STW","XYZ"};Map<String,String[]> allCitys = MaYiAllCityConstant.ALL_CITY;int i = 0;while(true) {if(MaYiAllCityConstant.on) {MaYiAllCityConstant.on=false;String cityName = cityNames[0];String[] citys = allCitys.get(cityName);for(String city : citys) {String city_pin_yin = PinYinUtils.convertLower(city);String nextUrl = URL+"/"+city_pin_yin;
System.out.println("MaYiIndexPipeline-->"+nextUrl);startGecco(nextUrl);}i++;}if(i == cityNames.length-1) {break;}}}private static void startGecco(String url) {HttpGetRequest start = new HttpGetRequest(url);start.setCharset("UTF-8");GeccoEngine.create().classpath("org.ssgroup.spider")//开始抓取的页面地址.start(start)//开启几个爬虫线程.thread(1)//.debug(true)//单个爬虫每次抓取完一个请求后的间隔时间.interval(5000).run();}private static void initCityLocation() throws Exception {Connection conn = null;PreparedStatement pstmt = null;try {String[] locations = MaYiAllCityConstant.CITY_LOCATION;conn = JdbcUtils.getConnection();conn.setAutoCommit(false);String sql = "INSERT INTO city_location(name,pin_yin,parent_id) " + "VALUES(?,?,?)";pstmt = conn.prepareStatement(sql);for(String location : locations) {pstmt.setString(1, location);pstmt.setString(2, PinYinUtils.convertLower(location));pstmt.setInt(3, 0);pstmt.addBatch();}pstmt.executeBatch();conn.commit();}catch (Exception e) {e.printStackTrace();conn.rollback();}finally {JdbcUtils.close(conn, pstmt, null);}}private static void initCity() throws Exception {Connection conn = null;PreparedStatement pstmt = null;try {Map<String,String[]> allCitys = MaYiAllCityConstant.ALL_CITY;for(Entry<String,String[]> entry : allCitys.entrySet()) {if(!"holdCity".equals(entry.getKey())){String[] citys = entry.getValue();//保存城市数据conn = JdbcUtils.getConnection();conn.setAutoCommit(false);String sql = "INSERT INTO city(name,pin_yin,first_pin_yin,first_last_pin_yin) " + "VALUES(?,?,?,?)";pstmt = conn.prepareStatement(sql);for(String city : citys) {pstmt.setString(1, city);pstmt.setString(2, PinYinUtils.convertLower(city));pstmt.setString(3, PinYinUtils.firstLetter(city).toLowerCase());pstmt.setString(4, PinYinUtils.capitalizeLetter(city).toLowerCase());pstmt.addBatch();}pstmt.executeBatch();conn.commit();}}}catch (Exception e) {e.printStackTrace();conn.rollback();}finally {JdbcUtils.close(conn, pstmt, null);}}private static void loadCityToMap() throws Exception {Connection conn = 
null;PreparedStatement pstmt = null;ResultSet resultSet = null;try {conn = JdbcUtils.getConnection();String sql = "SELECT id,name,pin_yin,first_pin_yin,first_last_pin_yin FROM city";pstmt = conn.prepareStatement(sql);resultSet = pstmt.executeQuery();while(resultSet.next()) {Long id = resultSet.getLong("id");String name = resultSet.getString("name");String pinYin = resultSet.getString("pin_yin");String firstPinYin = resultSet.getString("first_pin_yin");String firstLastPinYin = resultSet.getString("first_last_pin_yin");City city = new City();city.setId(id);city.setName(name);city.setPinYin(pinYin);city.setFirstPinYin(firstPinYin);city.setFirstLastPinYin(firstLastPinYin);CITY_CACHE.put(pinYin, city);}}catch (Exception e) {e.printStackTrace();}finally {JdbcUtils.close(conn, pstmt, resultSet);}}private static void loadCityLocationToMap() throws Exception {Connection conn = null;PreparedStatement pstmt = null;ResultSet resultSet = null;try {conn = JdbcUtils.getConnection();String sql = "SELECT id,name,pin_yin FROM city_location where parent_id=0";pstmt = conn.prepareStatement(sql);resultSet = pstmt.executeQuery();while(resultSet.next()) {Long id = resultSet.getLong("id");String name = resultSet.getString("name");String pinYin = resultSet.getString("pin_yin");CityLocation cityLocation = new CityLocation();cityLocation.setId(id);cityLocation.setName(name);cityLocation.setPinYin(pinYin);CITY_LOCATION_CACHE.put(pinYin, cityLocation);}}catch (Exception e) {e.printStackTrace();}finally {JdbcUtils.close(conn, pstmt, resultSet);}}
}

7.SQL

-- Cities; pin_yin is the lookup key used by the crawler.
CREATE TABLE `city` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT,
  `name` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市名称',
  `pin_yin` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市拼音',
  `first_pin_yin` varchar(10) COLLATE utf8_bin DEFAULT NULL COMMENT '首字母简写',
  `first_last_pin_yin` varchar(10) COLLATE utf8_bin DEFAULT NULL COMMENT '首尾字母简写',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='城市表';

-- Location filters per city; rows with parent_id = 0 are the location types.
-- city_name / city_pin_yin are widened to match city.name / city.pin_yin:
-- pinyin such as "xishuangbanna" does not fit into 10 characters.
CREATE TABLE `city_location` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT,
  `name` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市地段名称',
  `pin_yin` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市地段拼音',
  `city_id` bigint(10) DEFAULT NULL COMMENT '城市ID',
  `city_name` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市名称',
  `city_pin_yin` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市拼音',
  `parent_id` bigint(10) DEFAULT NULL COMMENT '父ID',
  `status` int(2) DEFAULT '0' COMMENT '状态',
  `href` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '请求路径',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='城市商业地段表';

-- Room listings; id is the room id taken from the site, not generated.
CREATE TABLE `rooms` (
  `id` bigint(10) NOT NULL,
  `price` DOUBLE(10,2) DEFAULT NULL COMMENT '价格',
  `house_location` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '房源地理位置',
  `original_url` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '图片原始地址',
  `image_url` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '缩略图地址',
  `title` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '房源标题',
  `num_room` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '几居室',
  `num_house` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '可住几个人',
  `city` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '属于哪个城市',
  `city_id` bigint(10) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='房源表';

利用Gecco爬取(蚂蚁短租网)列表页数据相关推荐

  1. [python爬虫] BeautifulSoup设置Cookie解决网站拦截并爬取蚂蚁短租

    我们在编写Python爬虫时,有时会遇到网站拒绝访问等反爬手段,比如这么我们想爬取蚂蚁短租数据,它则会提示"当前访问疑似黑客攻击,已被网站管理员设置为拦截"提示,如下图所示.此时我 ...

  2. python爬虫cookie池 与ip绑定_Python爬虫:设置Cookie解决网站拦截并爬取蚂蚁短租

    前言 文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者: Eastmount PS:如有需要Python学习资料的小伙伴可以加 ...

  3. Python爬虫:设置Cookie解决网站拦截并爬取蚂蚁短租

    我们在编写Python爬虫时,有时会遇到网站拒绝访问等反爬手段,比如这么我们想爬取蚂蚁短租数据,它则会提示"当前访问疑似黑客攻击,已被网站管理员设置为拦截"提示,如下图所示.此时我 ...

  4. 用pyton爬取某短租网信息

    import requests #用于向网站服务器发起请求 from bs4 import BeautifulSoup #用于处理服务反馈回来的网页文件 import pymongo #用于连接Mon ...

  5. 疫情过去女朋友想去重庆玩,python批量爬取小猪短租重庆民宿信息

    疫情过去女朋友想去重庆玩,python批量爬取小猪短租重庆民宿信息 随着时间的流逝,在中国共产党的领导,全国人民的共同努力下,疫情逐渐受到了控制,逐渐好转,复工,开学有望.最近在和女朋友的闲聊当中得知 ...

  6. Python爬虫入门 | 5 爬取小猪短租租房信息

    小猪短租是一个租房网站,上面有很多优质的民宿出租信息,下面我们以成都地区的租房信息为例,来尝试爬取这些数据. 小猪短租(成都)页面:http://cd.xiaozhu.com/   1.爬取租房标题 ...

  7. 如何利用 C# 爬取「中国图书网 - 计算机与互联网图书销量榜」!

    每周一,我们会爬取「京东」.「当当」.「互动出版网」和「中国图书网」的图书7日销量数据并做一份榜单,已经持续一段时间了,从后台阅读量的统计数据来看,对辅助大家选购计算机类书籍还是有帮助的.如果大家对这 ...

  8. python3爬取巨潮资讯网的年报数据

    python3爬取巨潮资讯网的年报数据 前期准备: 需要用到的库: 完整代码: 前期准备: 巨潮资讯网有反爬虫机制,所以先打开巨潮资讯网的年报板块,看看有什么解决办法. 巨潮咨询年报板块 可以通过这样 ...

  9. 利用Python爬取《囧妈》豆瓣短评数据,并进行snownlp情感分析

    利用Python爬取<囧妈>豆瓣短评数据,并进行snownlp情感分析 一.电影评论爬取 今年的贺岁片<囧妈>上映前后,在豆瓣评论上就有不少网友发表了自己的观点,到底是好评的声 ...

  10. python爬取网上租房信息_Python爬虫入门 | 5 爬取小猪短租租房信息

    小猪短租是一个租房网站,上面有很多优质的民宿出租信息,下面我们以成都地区的租房信息为例,来尝试爬取这些数据. 1.爬取租房标题 按照惯例,先来爬下标题试试水,找到标题,复制xpath. 多复制几个房屋 ...

最新文章

  1. 从零打造聚合支付系统:一、浅谈聚合支付的核心价值
  2. 超级简单的mysql主从数据库配置攻略以及错误处理
  3. [Qt教程] 第21篇 数据库(一)Qt数据库应用简介
  4. leetcode990. 等式方程的可满足性(并查集)
  5. Pytorch:PIL 和CV
  6. [转载] arrayproxy转numpy_Python numpy.ptp() 使用实例
  7. java的finalize方法使用
  8. SSM环境+jquery+ajax 实现批量文件上传并预览后,同时上传文件和数据 校验图片后缀是否合法 文件大小是否超限
  9. yaml 文件格式语法
  10. python自动化要学全部基础知识_FishC工作室《零基础学习python》全套课后题
  11. python归一化函数_机器学习-归一化方法
  12. 正交幅度调制(QAM)
  13. 【Micro USB选择指南】手工焊接Micro USB接口器件型号选择
  14. 读书有益——》《断舍离·舍·做减法的勇气》
  15. 坐标计算距离公式 火星坐标系_根据经纬度计算距离的公式、百度坐标转换成GPS坐标(PHP版)...
  16. logout命令详解
  17. 学习计划【硬件课程设计】【课设】
  18. 关于WIN10系统无法打开CHM文件
  19. 120年奥运史:运动员和成绩(相关数据集)
  20. 必须了解的五个服务器基础问题

热门文章

  1. Linux内存管理(二):ARMv8 地址转换
  2. 带管理职位面试中遇到的常见经典问题的回答
  3. PHP消息队列的实现方式
  4. 极客时间java高级(第二次课程)
  5. mysql 查询所有表结构_mysql数据库查看表结构
  6. 网络安全——网络空间搜索引擎
  7. Typescript的应用与思考
  8. 解决PMML namespace URI httpwww.dmg.orgPMML-4_4 is not supported
  9. Could not resolve project
  10. 【树莓派】树莓派4B新手篇:安装官网Raspbian Buster系统及基础配置