python实现目录中制定内容查找

#【函数实现代码】------------------------------------------------------------------------------------------------------------------------------------------------------------------

#_*_coding=utf-8_*___author__ = 'fang'__date__ = '2019/2/25 9:58'

import os,codecs,chardetfrom time import timeprint(__file__)thistime = time()

def endWith(s, *endstring):    """    检查文件名称是否以endstring为结尾    :param s:     :param endstring:     :return:     """    array = map(s.endswith, endstring)    if True in array:        return True    else:        return False

def writeResultLog(allExistsKeywords):    """    获取查询结果的内容    将全部已搜索到的关键字列表中的内容保存到result.log文件中    :param allExistsKeywords:     :return:     """

    __logfilename = "result.log"  # 相对路径,文件在.py文件所在的目录中    # 行分隔符    ls = os.linesep    # 结果日志文件名    try:        fobj = open(__logfilename, 'w',errors='ignore')    except IOError as e:        print("*** file open error:", e)

    else:        # print(allExistsKeywords)        fobj.writelines(['%s%s' % (keyword, ls) for keyword in allExistsKeywords])        fobj.close()#判断文件的编码方式是否是utf8格式文件，是返回True否则False# def existBOM(file_obj):#     code = file_obj.read(3)#     file_obj.close()#     if code == codecs.BOM_UTF8:  # 判断是否包含EF BB BF#         return True   #如果要去掉头部信息的话s = s[len(codecs.BOM_UTF8):]#     return False

def searchFilesContent(url):    """    从searchkeywords.txt文件中初始化待搜索关键字列表    :param dirname:     :return:     """    filename = "searchkeywords.txt"  # 相对路径,文件在.py文件所在的目录中,搜索关键字的文件    # 待搜索关键字列表    allSearchKeywords = []    # 遍历文件当前行已搜索到的关键字列表    existsKeywordsThisLine = []    allExistsKeywords = []    #放置所有搜索的文件    thistime = time()    try:        fobj = open(filename, 'r')    except IOError as e:        print("*** file open error:", e)    else:        for eachLine in fobj:            allSearchKeywords.append(eachLine.strip('\n'))  # 使用strip函数去除每行的换行符        fobj.close()

    # 从excludekeywords.txt文件中初始化要排除的搜索关键字列表    filename = "excludekeywords.txt"  # 相对路径,文件在.py文件所在的目录中    # 要排除的搜索关键字列表    allExcludedKeywords = []    try:        fobj = open(filename, 'r')    except IOError as e:        print("*** file open error:", e)

    else:        for eachLine in fobj:            allExcludedKeywords.append(eachLine.strip('\n'))  # 使用strip函数去除每行的换行符        fobj.close()

    # 从全部已搜索到的关键字列表排除掉不用搜索的关键字    for excluedkw in allExcludedKeywords:        if (excluedkw in allSearchKeywords):            allSearchKeywords.remove(excluedkw)    # 遍历打开所有要在其中搜索内容的文件，若待搜索关键字列表为空，则不再继续遍历    for root, dirs, files in os.walk(url):        for file in files:            if endWith(file, '.txt', '.py'):  # 只在扩展名为'.txt', '.py'文件中搜索                # 打开文件                filename = root + os.sep + file  # 绝对路径                filename = filename.replace("\\","\\\\")  # 将路径中的单反斜杠替换为双反斜杠，因为单反斜杠可能会导致将路径中的内容进行转义了，replace函数中"\\"表示单反斜杠，"\\\\"表示双反斜杠                try:                    # ==========开始读取目录中的文件遍历查找的过程                    fobj = codecs.open(filename, 'r', 'utf_8_sig', errors='ignore')                except IOError as e:                    print("*** file open error:", e)                else:                    # 遍历文件的每一行                    allSearchKeywords_1 = allSearchKeywords                    for fileLine in fobj:                        # 判断当前行是否包含所有搜索关键字                        for keyword in allSearchKeywords:                            # 若包含，并添加到该行已搜索到的关键字列表中                            if keyword.upper() in fileLine.upper():  # 将搜索关键字和该行文本内容都转换为大写后再进行匹配                                existsKeywordsThisLine.append(keyword)

                        # 将这些搜索到的关键字添加到全部已搜索到的关键字列表中，并包含文件名信息                        for keyword in existsKeywordsThisLine:                            allExistsKeywords.append(keyword + "\t" + filename.replace("\\\\", "\\"))                        if allSearchKeywords is None:                            existsKeywordsThisLine = []                            break                        # 清空该行已搜索到的关键字列表内容                        existsKeywordsThisLine = []                    allSearchKeywords = allSearchKeywords_1                    fobj.close()                    # 全部文件遍历结束    writeResultLog(allExistsKeywords)    print("DONE!", )# 仅当本python模块直接执行时，才执行如下语句，若被别的python模块引入，则不执行if __name__ == '__main__':    url = r"E:\python_data"    searchFilesContent(url)    search_time = time() - thistime    print('The code run {:.0f}m {:.0f}s'.format(search_time // 60, search_time % 60))

#【类实现代码】------------------------------------------------------------------------------------------------------------------------------------------------------------------

#_*_coding=utf-8_*_
__author__ = 'fang'
__date__ = '2019/2/25 9:58'

import os,codecs,chardet
from multiprocessing import Process,Queue, Lock,current_process
from time import time
print(__file__)
class File_Search(object):
    def __init__(self, url):
        """初始化"""
        self.__url = url

def endWith(self, s, *endstring):
        """
        检查文件名称是否以endstring为结尾
        :param s:
        :param endstring:
        :return:
        """
        array = map(s.endswith, endstring)
        if True in array:
            return True
        else:
            return False

def proc_read(self):
        allExistsKeywords = []
        while True:
            try:
                data = self.q.get()
                allExistsKeywords.extend(data)
            except:
                print("get读取查询到的数据结束，数据是",allExistsKeywords)
                self.writeResultLog(allExistsKeywords)
                break
        return time() - self.thistime

def writeResultLog(self):
        """
        获取查询结果的内容
        将全部已搜索到的关键字列表中的内容保存到result.log文件中
        :param allExistsKeywords:
        :return:
        """
        allExistsKeywords = []
        while True:
            try:
                data = self.q.get(block=False)
                allExistsKeywords.extend(data)
            except :
                print("差最后一步保存就可以了....")
                break
        self.__logfilename = "result.log" # 相对路径,文件在.py文件所在的目录中
        # 行分隔符
        ls = os.linesep
        # 结果日志文件名
        try:
            fobj = open(self.__logfilename, 'w',errors='ignore')
        except IOError as e:
            print("*** file open error:", e)

else:
            # print(allExistsKeywords)
            fobj.writelines(['%s%s' % (keyword, ls) for keyword in allExistsKeywords])
            fobj.close()
        return time() - self.thistime
    #判断文件的编码方式是否是utf8格式文件，是返回True否则False
    # def existBOM(file_obj):
    #     code = file_obj.read(3)
    #     file_obj.close()
    #     if code == codecs.BOM_UTF8: # 判断是否包含EF BB BF
    #         return True   #如果要去掉头部信息的话s = s[len(codecs.BOM_UTF8):]
    #     return False

def searchFilesContent(self):
        """
        从searchkeywords.txt文件中初始化待搜索关键字列表
        :param dirname:
        :return:
        """
        self.__filename = "searchkeywords.txt" # 相对路径,文件在.py文件所在的目录中,搜索关键字的文件
        # 待搜索关键字列表
        self.__allSearchKeywords = []
        # 遍历文件当前行已搜索到的关键字列表
        self.__existsKeywordsThisLine = []
        # 全部已搜索到的关键字列表
        self.__allExistsKeywords = []
        #子进程创建Queue，并传给各个子进程
        self.q = Queue()
        #放置所有搜索的文件
        self.__filename_list = []
        #放置所有进程的列表
        self.process_list = []
        #进程间锁
        self.lock = Lock()
        self.thistime = time()
        filename_list = []
        try:
            fobj = open(self.__filename, 'r')
        except IOError as e:
            print("*** file open error:", e)
        else:
            for eachLine in fobj:
                self.__allSearchKeywords.append(eachLine.strip('\n')) # 使用strip函数去除每行的换行符
            fobj.close()

# 从excludekeywords.txt文件中初始化要排除的搜索关键字列表
        filename = "excludekeywords.txt" # 相对路径,文件在.py文件所在的目录中
        # 要排除的搜索关键字列表
        allExcludedKeywords = []
        try:
            fobj = open(filename, 'r')
        except IOError as e:
            print("*** file open error:", e)

else:
            for eachLine in fobj:
                allExcludedKeywords.append(eachLine.strip('\n')) # 使用strip函数去除每行的换行符
            fobj.close()

# 从全部已搜索到的关键字列表排除掉不用搜索的关键字
        for excluedkw in allExcludedKeywords:
            if (excluedkw in self.__allSearchKeywords):
                self.__allSearchKeywords.remove(excluedkw)
        # 遍历打开所有要在其中搜索内容的文件，若待搜索关键字列表为空，则不再继续遍历
        for root, dirs, files in os.walk(self.__url):
            for file in files:
                if self.endWith(file, '.txt', '.py'): # 只在扩展名为'.txt', '.py'文件中搜索
                    # 打开文件
                    filename = root + os.sep + file # 绝对路径
                    filename = filename.replace("\\","\\\\") # 将路径中的单反斜杠替换为双反斜杠，因为单反斜杠可能会导致将路径中的内容进行转义了，replace函数中"\\"表示单反斜杠，"\\\\"表示双反斜杠
                    # filename_list.append(filename)
                    self.pp = Process(target=file_search.run, args=(self.q, self.lock, filename))
                    self.pp.start()
                    self.process_list.append(self.pp)
        # return filename_list
    # print("DONE!", )
    def run(self, q, l, filename):
        """
        多进程搜索文件，查找并记录到进程队列中
        :param q: 进程间通信队列
        :param l: 进程间控制锁
        :param filename: 接收的文件的绝对路径
        :return:
        """
        l.acquire()
        try:
            # ==========开始读取目录中的文件遍历查找的过程
            fobj = codecs.open(filename, 'r', 'utf_8_sig', errors='ignore')
        except IOError as e:
            print("*** file open error:", e)
        else:
            # 遍历文件的每一行
            for fileLine in fobj:
                # 判断当前行是否包含所有搜索关键字
                for keyword in self.__allSearchKeywords:
                    # 若包含，并添加到该行已搜索到的关键字列表中
                    if keyword.upper() in fileLine.upper(): # 将搜索关键字和该行文本内容都转换为大写后再进行匹配
                        self.__existsKeywordsThisLine.append(keyword)

# 将这些搜索到的关键字添加到全部已搜索到的关键字列表中，并包含文件名信息
for keyword in self.__existsKeywordsThisLine:
self.__allExistsKeywords.append(keyword + "\t" + filename.replace("\\\\", "\\"))

# 清空该行已搜索到的关键字列表内容
                self.__existsKeywordsThisLine = []
            fobj.close()
            # 全部文件遍历结束
            # self.writeResultLog(self.__allExistsKeywords)
            q.put(self.__allExistsKeywords) #放置到队列中
            print('当前进程的名字是： ', current_process().name,'已放置到队列中....')
        l.release()

def __del__(self):
            """
            析构函数
            join所完成的工作就是线程同步，即主线程任务结束之后，进入阻塞状态，一直等待其他的子线程执行结束之后，主线程再终止
            :param self:
            :return:
            """
            for process in self.process_list:
                process.join()

# 仅当本python模块直接执行时，才执行如下语句，若被别的python模块引入，则不执行
if __name__ == '__main__':
    url = r"E:\python_data"
    file_search = File_Search(url)
    file_search.searchFilesContent()
    search_time = file_search.writeResultLog()
    print('The code run {:.0f}m {:.0f}s'.format(search_time // 60, search_time % 60))

以上函数实现和类实现的功能是一样的，但是类实现需要的时间相比函数实现要长很多，贴在这里期待帮助初学的我指点一二，如何进行代码优化

转载于:https://www.cnblogs.com/chaolumeng/p/10452139.html

python实现目录中制定内容查找相关推荐

pythonexcel汇总_用python汇总excel表格数据-怎样用python遍历表格中的内容
如何用python将数据写入excel表格 # 需 xlrd-0.9.2 xlutils-1.7.1 这两个模块 from xlwt import Workbook, Formula import ...
php遍历指定目录中的内容2
输出文件是否可读写,可执行,并同时输出创建时间,修改时间,访问时间 //2.遍历指定目录中的内容 if ($arr['file']) {$arrbyte = array("Byte" ...
Linux命令格式查询目录中的内容
Linux常用命令 1. [root@localhost~]# · root:代表当前登录用户,Linux当中管理员账户是root而不是administrater! · @:没有特殊含义 · ...
python显示目录中的文件_Python中的文件和目录操作实现
Python中的文件和目录操作实现对于文件和目录的处理,虽然可以通过操作系统命令来完成,但是Python语言为了便于开发人员以编程的方式处理相关工作,提供了许多处理文件和目录的内置函数.重要的是,这 ...
Python 列出目录中的文件
在本文中,我们将了解如何在 Python 中列出目录中的所有文件.有多种方法可以列出目录的文件.在本文中,我们将使用以下四种方法. os.listdir('dir_path'):返回指定目录路径中存 ...
python移动文件中某个内容_如何在Python中移动文件
如何在Python中移动文件我查看了Python $ mv ...接口,但无法找到移动文件的方法. 我如何在Python中执行相当于$ mv ...的操作? >>> source_ ...
python读取文本中的内容
1.获取路径符 pathseq = os.path.seq 2.文本中获取内容 import os import sys from collections import OrderedDictdef ...
python复制word中的内容，包括格式、图片、文字
需要安装模块,win32com 下载地址:http://download.csdn.net/detail/clean_water/9759816 # 导入模块 import win32com.clie ...
python读出文件中的内容_Python读取文本内容
综述在Python中,读文件主要分为三个步骤: 打开文件 . 读取内容 . 关闭文件 .一般形式如下: try: file = open('/path/to/file', 'r') # 打开文件 d ...

python实现目录中制定内容查找

python实现目录中制定内容查找相关推荐

最新文章

热门文章