python简单爬虫手机号_python手机号前7位归属地爬虫代码实例

需求分析

项目上需要用手机号前7位来判断号码是否合法，并查询归属地。旧的数据是几年前的，已经过时了，所以打算用 Python 爬虫重新爬一份。

单线程版本

# coding:utf-8

import requests

from datetime import datetime

class PhoneInfoSpider:
    """Sequentially query the Taobao ``mobile_tel_segment`` endpoint for phone prefixes.

    For every head in ``phoneSections`` the spider builds numbers of the form
    ``head + 4-digit middle + "0000"``, requests the endpoint once per number
    and appends one comma-separated line per usable answer to ``./result.txt``.
    """

    def __init__(self, phoneSections, timeout=10):
        """Remember which prefixes to crawl.

        Args:
            phoneSections: iterable of prefix strings (the script in this file
                passes 3-digit heads such as ``'133'``).
            timeout: seconds to wait for each HTTP request; without a timeout
                a stalled connection would block the crawl indefinitely.
        """
        self.phoneSections = phoneSections
        self.timeout = timeout

    def phoneInfoHandler(self, textData):
        """Parse one endpoint response and append it to ``./result.txt``.

        The response is read purely by line position: the first single-quoted
        value on line index 1 is used as the number, index 2 as the province,
        index 3 as ``mobile_area`` and index 5 as ``postcode``. Responses with
        fewer than 9 lines are silently ignored.
        NOTE(review): which endpoint field sits on line 5 is not visible here;
        confirm it really is a postal code.

        Args:
            textData: raw response body as text.

        Raises:
            IndexError: if one of the inspected lines has no single-quoted value.
        """
        text = textData.splitlines(True)
        if len(text) < 9:
            return

        number = text[1].split('\'')[1]
        province = text[2].split('\'')[1]
        mobile_area = text[3].split('\'')[1]
        postcode = text[5].split('\'')[1]

        line_text = number + "," + province + "," + mobile_area + "," + postcode
        print(line_text)

        try:
            # The context manager closes the handle after every record; an
            # explicit encoding keeps the output identical across platforms.
            with open('./result.txt', 'a', encoding='utf-8') as f:
                f.write(line_text + '\n')
        except Exception as e:
            # Best effort: a failed write must not stop the crawl.
            print(type(e).__name__, ":", e)

    def requestPhoneInfo(self, phoneNum):
        """Fetch the endpoint for ``phoneNum`` and hand the body to the parser.

        Any error (network failure, timeout, unparsable body) is printed and
        swallowed so that one bad number does not abort the whole run.

        Args:
            phoneNum: the phone number string appended to the query URL.
        """
        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            response = requests.get(url, timeout=self.timeout)
            self.phoneInfoHandler(response.text)
        except Exception as e:
            print(type(e).__name__, ":", e)

    def requestAllSections(self, last=0, count=10):
        """Crawl every configured prefix, printing begin/end timestamps for each.

        Args:
            last: middle value to resume from for the FIRST prefix only (use
                it to continue after an abnormal exit); later prefixes always
                start at 0.
            count: exclusive upper bound of the 4-digit middle part. The
                default of 10 is a small sample run; pass 10000 to cover a
                whole prefix.
        """
        for head in self.phoneSections:
            head_begin = datetime.now()
            print(head + " begin time:" + str(head_begin))

            for i in range(last, count):
                # Number = head + zero-padded 4-digit middle + fixed "0000" tail.
                middle = str(i).zfill(4)
                phoneNum = head + middle + "0000"
                self.requestPhoneInfo(phoneNum)

            # The resume offset only applies to the first prefix.
            last = 0

            head_end = datetime.now()
            print(head + " end time:" + str(head_end))

if __name__ == '__main__':
    # Entry point of the single-threaded crawler: build the prefix list,
    # crawl it once and print the overall begin/end timestamps.
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))

    # Prefixes grouped by carrier, in order: China Telecom, China Unicom,
    # China Mobile, virtual operators.
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' used to be listed twice, which made the spider crawl that prefix
    # twice and write duplicate rows.
    lt = ['130', '131', '132', '145', '146', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = [
        '134', '135', '136', '137', '138', '139', '147', '148', '150', '151', '152', '157', '158', '159', '172',
        '178', '182', '183', '184', '187', '188', '198',
    ]
    add = ['170']

    all_num = dx + lt + yd + add
    print(len(all_num))

    # Crawl every listed prefix.
    spider = PhoneInfoSpider(all_num)
    spider.requestAllSections()

    task_end = datetime.now()
    print("phone check end time:" + str(task_end))

实测发现，爬取一个号段需要 10000 次查询，单线程版大约要 1 个半小时，太慢了。

多线程版本

# coding:utf-8

import requests

from datetime import datetime

import queue

import threading

# Number of worker threads started for each phone-number prefix.
threadNum = 32

class MyThread(threading.Thread):
    """Thread that runs a zero-argument callable when started."""

    def __init__(self, func):
        """Store the callable to execute.

        Args:
            func: callable taking no arguments; it is invoked once by ``run``.
        """
        super().__init__()
        self.func = func

    def run(self):
        """Invoke the stored callable in the new thread."""
        self.func()

def requestPhoneInfo(timeout=10):
    """Worker loop: drain the shared task queue and query one number per task.

    Reads the module-level ``q`` (task queue), ``lock`` and ``phone_head``
    that the ``__main__`` section sets up before starting the threads. Each
    task is an integer; the main section enqueues 1..10000, so task ``p``
    maps to the 4-digit middle part ``p - 1`` (0000..9999). The loop ends as
    soon as the queue is found empty.

    Args:
        timeout: seconds to wait for each HTTP request, so that a stalled
            connection cannot keep a worker (and the final ``join``) blocked
            forever.
    """
    while True:
        # Check-and-dequeue under the lock so two workers never race for the
        # last task; ``with`` also guarantees the lock is released on error.
        with lock:
            if q.qsize() == 0:
                break
            print("queue size:" + str(q.qsize()))
            p = q.get()  # fetch the next task

        # Derive the number from the task itself. Reading q.qsize() again
        # here (outside the lock) would race with other workers and produce
        # duplicated or skipped numbers.
        middle = str(p - 1).zfill(4)
        phoneNum = phone_head + middle + "0000"
        print("phoneNum:" + phoneNum)

        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            response = requests.get(url, timeout=timeout)
            phoneInfoHandler(response.text)
        except Exception as e:
            # Best effort: report the failure and move on to the next task.
            print(type(e).__name__, ":", e)

def phoneInfoHandler(textData):
    """Parse one endpoint response and append it to ``./result.txt``.

    The response is read purely by line position: the first single-quoted
    value on line index 1 is used as the number, index 2 as the province,
    index 3 as ``mobile_area`` and index 5 as ``postcode``. Responses with
    fewer than 9 lines are silently ignored.
    NOTE(review): which endpoint field sits on line 5 is not visible here;
    confirm it really is a postal code.

    Args:
        textData: raw response body as text.

    Raises:
        IndexError: if one of the inspected lines has no single-quoted value.
    """
    text = textData.splitlines(True)
    if len(text) < 9:
        return

    number = text[1].split('\'')[1]
    province = text[2].split('\'')[1]
    mobile_area = text[3].split('\'')[1]
    postcode = text[5].split('\'')[1]

    line_text = number + "," + province + "," + mobile_area + "," + postcode
    print(line_text)

    try:
        # Close (and therefore flush) the file after every record instead of
        # leaking one open handle per call; the explicit encoding keeps the
        # output identical across platforms.
        with open('./result.txt', 'a', encoding='utf-8') as f:
            f.write(line_text + '\n')
    except Exception as e:
        # Best effort: a failed write must not stop the crawl.
        print(type(e).__name__, ":", e)

if __name__ == '__main__':
    # Entry point of the multi-threaded crawler: for each prefix, fill a
    # queue with 10000 tasks and let ``threadNum`` workers drain it.
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))

    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' used to be listed twice, which made the whole prefix be crawled
    # twice and duplicated its rows in the result file.
    lt = ['130', '131', '132', '145', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = [
        '134', '135', '136', '137', '138', '139', '147', '150', '151', '152', '157', '158', '159', '172', '178',
        '182', '183', '184', '187', '188', '198',
    ]

    all_num = dx + lt + yd
    print(len(all_num))

    for head in all_num:
        head_begin = datetime.now()
        print(head + " begin time:" + str(head_begin))

        # Module-level state read by the requestPhoneInfo workers.
        q = queue.Queue()
        threads = []
        lock = threading.Lock()

        # Tasks are numbered 1..10000, one per 4-digit middle value.
        for task in range(10000):
            q.put(task + 1)
        print(q.qsize())

        # Publish the current prefix once, before any worker starts. All
        # workers are joined below, so the value never changes under them.
        phone_head = head

        for _ in range(threadNum):
            thread = MyThread(requestPhoneInfo)
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()

        head_end = datetime.now()
        print(head + " end time:" + str(head_end))

    task_end = datetime.now()
    print("phone check end time:" + str(task_end))

多线程版爬取 1 个号码段的 10000 条数据，大概 2～3 分钟就能完成；期间 CPU 使用率飙升，大致维持在 70% 左右。

总共 40 多个号段，全部爬完大概需要 1～2 个小时，数据总量约 41 万条。

以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持我们。

本文标题: python手机号前7位归属地爬虫代码实例

本文地址: http://www.cppcns.com/jiaoben/python/304740.html

python简单爬虫手机号_python手机号前7位归属地爬虫代码实例相关推荐

python 爬手机号_python手机号前7位归属地爬虫代码实例
需求分析项目上需要用到手机号前7位,判断号码是否合法,还有归属地查询.旧的数据是几年前了太久了,打算用python爬虫重新爬一份单线程版本 # coding:utf-8 import reques ...
python手机号定位_python手机号前7位归属地爬虫
需求分析项目上需要用到手机号前7位,判断号码是否合法,还有归属地查询.旧的数据是几年前了太久了,打算用python爬虫重新爬一份单线程版本 # coding:utf-8 import reques ...
python控制软件点击_Python小程序控制鼠标循环点击代码实例
Python小程序控制鼠标循环点击代码实例这篇文章主要介绍了Python小程序控制鼠标循环点击代码实例,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友可以 ...
python list join函数_Python中join（）函数多种操作代码实例
这篇文章主要介绍了Python中join()函数多种操作代码实例,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友可以参考下 Python中有.join()和os ...
python爬取多页数据_python爬虫实现爬取同一个网站的多页数据代码实例
本篇文章小编给大家分享一下python爬虫实现爬取同一个网站的多页数据代码实例,文章代码介绍的很详细,小编觉得挺不错的,现在分享给大家供大家参考,有需要的小伙伴们可以来看看. 一.爬虫的目的从网上获 ...
在当当买了python怎么下载源代码-Python爬取当当、京东、亚马逊图书信息代码实例...
注:1.本程序采用MSSQLserver数据库存储,请运行程序前手动修改程序开头处的数据库链接信息 2.需要bs4.requests.pymssql库支持 3.支持多线程 from bs4 impor ...
python 爬虫框架_Python常用的几个高效率的爬虫框架
1.Scrapy Scrapy是一个为了爬取网站数据,提取结构性数据而编写的应用框架. 可以应用在包括数据挖掘,信息处理或存储历史数据等一系列的程序中.用这个框架可以轻松爬下来如亚马逊商品信息之类的数 ...
python简单的预测模型_python简单预测模型
python简单预测模型步骤1:导入所需的库,读取测试和训练数据集. #导入pandas.numpy包,导入LabelEncoder.random.RandomForestClassifier.Gr ...
python爬取美女_Python爬取高颜值美女（爬虫+人脸检测+颜值检测）附学习教程
1 数据源知乎话题『美女』下所有问题中回答所出现的图片 2 抓取工具 Python 3,并使用第三方库 Requests.lxml.AipFace,代码共 100 + 行 3 必要环境Mac / L ...

python简单爬虫手机号_python手机号前7位归属地爬虫代码实例

python简单爬虫手机号_python手机号前7位归属地爬虫代码实例相关推荐

最新文章

热门文章