Hey! Here's what I get Python to do for me


  Python is without question one of the hottest languages around right now, but if we don't get to use it at work, what can we use it for? Here's what I do with it.

1. Can't use it in your day job? Use it as a calculator!

# add, subtract, multiply, divide
>>> (3 + 2) - 5 * 1
0
# bit operations (left shift)
>>> 3 << 2
12
# ** is the power operator (careful: in Python ^ is bitwise XOR, not power)
>>> 3 ** 2
9
# pow() is the function form; with a fractional exponent either one takes roots
>>> pow(9, 0.5)
3.0
# base conversion: decimal to binary / hex / octal, and back with int()
>>> bin(2)
'0b10'
>>> hex(25)
'0x19'
# (oct() output below is Python 2; Python 3 prints '0o12')
>>> oct(10)
'012'
>>> int('e0', 16)
224
# show a decimal integer as a zero-padded, fixed-width binary string (default 32 bits); handy for reading two's-complement values
def decbin(i, bit=32):
  return (bin(((1 << bit) - 1) & i)[2:]).zfill(bit)
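A quick sanity check of the helper above (using an 8-bit width so the output stays short; note how a negative number shows up as its two's complement):

>>> decbin(5, 8)
'00000101'
>>> decbin(-5, 8)
'11111011'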

2. Write a simple crawler
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import urllib,urllib2
import re
import os
import HTMLParser
dirbase = '/tmp'
urlbase = 'http://hg.openjdk.java.net'
url= urlbase + '/jdk8u/jdk8u/jdk/file/dddb1b026323/src'        #/jdk,/hotspot
skip_to_p = ''
skip_find = False;
textmod ={'user':'admin','password':'admin'}
textmod = urllib.urlencode(textmod)
print(url)
req = urllib2.Request(url = '%s%s%s' % (url,'?',textmod))
res = urllib2.urlopen(req)
res = res.read()
alink = re.findall(r'<a',res)
allflist = []

table=re.findall(r'<tbody class="stripes2">(.+)<\/tbody>',res, re.S)

harr = re.findall(r'href="(/jdk8u[\w\/\._]+)">(?!\[up\])', table[0])

def down_src_recursion(harr):
  global allflist,skip_find;
  if(not harr):
    return False;
  i=0; arrlen = len(harr)
  lock_conflict_jump_max = 2;   # when a lock file is hit, skip the next n files
  lock_conflict_jumping = 0;    # how many files still need to be skipped
  print("in new dir cur...")
  if(len(allflist) > 1500):
     print('over 1500, cut to 50 exists...')
     allflist = allflist[-800:]
  for alink in harr:
    i += 1;
    alink = alink.rstrip('/')
    if(skip_to_p and not skip_find):
      if(alink != skip_to_p):
        print('skipping, resume point not reached yet..., skip=%s, now=%s' % (skip_to_p, alink))
        continue;
      else:
        skip_find = True;
    if(alink in allflist):
      print('directory already visited: ' + alink)
      continue;
    pa = dirbase + alink
    if(os.path.isfile(pa)):
      print('file already exists, skipping download: ' + pa)
      continue;
    lockfile=pa+'.tmp'
    if(os.path.isfile(lockfile)):
      lock_conflict_jumping = lock_conflict_jump_max;
      print('file is being downloaded elsewhere, skipping, %s more to skip...: %s' % (lock_conflict_jumping, lockfile))
      continue;
    else:
      if(lock_conflict_jumping > 0):
        lock_conflict_jumping -= 1;
        print('still skipping after a lock conflict, %s left...: %s' % (lock_conflict_jumping, lockfile))
        continue;
    # Mark the file as in-progress (with a .tmp lock file) before downloading; the download itself is slow, so writing the marker only after it finishes would be too late
    if(pa.endswith(('.gif','.jpg','.png', '.xml', '.cfg', '.properties', '.make', '.sh', '.bat', '.html', '.c','.cpp', '.h', '.hpp', '.java', '.1'))):
       os.mknod(lockfile);
    reqt = urllib2.Request(urlbase + alink)
    rest = urllib2.urlopen(reqt)
    rest = rest.read()
    allflist.append(alink)
    if(rest.find('class="sourcefirst"') > 0):
       print('this is a source file: %s         %d/%d' % (alink, i, arrlen))
       if(not os.path.isfile(lockfile)):
          os.mknod(lockfile);
       filename = alink.split('/')[-1]
       linearr = re.findall(r'<span id=".+">(.+)</span>', rest)
       fileObject = open(dirbase + alink, 'w')
       for line in linearr:
          try:
            line = HTMLParser.HTMLParser().unescape(line)
          except UnicodeDecodeError as e:
            print('oops, an encoding error occurred while unescaping: %s' % e)
          fileObject.write(line + '\r\n')
       fileObject.close()
       os.remove(lockfile); 
    else:
      print('this is a directory: %s        %d/%d' % (alink, i, arrlen))
      if(not os.path.exists(pa)):
         print('creating directory: %s' % alink)
         os.makedirs(dirbase + alink, mode=0777)
      ta=re.findall(r'<tbody class="stripes2">(.+)<\/tbody>',rest, re.S)
      ha = re.findall(r'href="(/jdk8u[\w\/\._]+)">(?!\[up\])', ta[0])
      down_src_recursion(ha)

# go...
down_src_recursion(harr);

The crawler above is written for Python 2; for Python 3 it needs a few changes:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# for python3

import urllib.parse
import urllib.request
import re
import os
import html
dirbase = '/tmp'
urlbase = 'http://hg.openjdk.java.net'
url= urlbase + '/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/sun/misc'
#skip_to_p = '/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/sun/misc'
skip_to_p = ''
skip_find = False;
textmod ={'user':'admin','password':'admin'}
textmod = urllib.parse.urlencode(textmod)
print(url)
res = urllib.request.urlopen(url = '%s%s%s' % (url,'?',textmod))
res = res.read().decode('utf-8')
alink = re.findall(r'<a', res)
allflist = []

table=re.findall(r'<tbody class="stripes2">(.+)<\/tbody>',res, re.S)

harr = re.findall(r'href="(/jdk8u[\w\/\._]+)">(?!\[up\])', table[0])

def down_src_cur(harr):
    global allflist,skip_find;
    if(not harr):
        return False;
    i=0; 
    arrlen = len(harr);
    print("- In new dir cur...")
    if(len(allflist) > 1500):
        print('- Over 1500, cut to 50 exists...')
    allflist = allflist[-800:]
    for alink in harr:
        i += 1;
        alink = alink.rstrip('/')
        if(skip_to_p and not skip_find):
            if(alink != skip_to_p):
                print('- Skip file, cause no find..., skip=%s,now=%s' % (skip_to_p, alink))
                continue;
            else:
                skip_find = True;
        if(alink in allflist):
            print('- Searched before:' + alink)
            continue;
        rest='';
        try:    
            res = urllib.request.urlopen(urlbase + alink)
            rest = res.read().decode('utf-8')
        except Exception as e:
            print(e)
            print(" ERROR accour, continue;")
            continue;
        allflist.append(alink)
        if(rest.find('class="sourcefirst"') > 0):
            print('- source file: %s         %d/%d' % (alink, i, arrlen))
            filename = alink.split('/')[-1]
            linearr = re.findall(r'<span id=".+">(.+)</span>', rest)
            fileObject = open(dirbase + alink, 'w')
            for line in linearr:
                fileObject.write(html.unescape(line) + '\r\n')
            fileObject.close()
        else:
            pa = dirbase + alink
            print('- Directory:%s        %d/%d' % (alink, i, arrlen))
            if(not os.path.exists(pa)):
                print('creating directory: %s' % alink);
                os.makedirs(dirbase + alink, mode=0o777);
            ta=re.findall(r'<tbody class="stripes2">(.+)<\/tbody>',rest, re.S)
            ha = re.findall(r'href="(/jdk8u[\w\/\._]+)">(?!\[up\])', ta[0])
            # recurse into the subdirectory
            down_src_cur(ha)

down_src_cur(harr);
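The differences between the two versions come down to a few renamed modules. The snippet below is only a compact illustration of the Python 3 calls the second script already uses; the URL and query parameters are the same placeholders as above, and the old OpenJDK Mercurial host may no longer be reachable, so treat it as a sketch rather than something guaranteed to run today.

import urllib.parse
import urllib.request
import html

# placeholder URL and parameters, purely for illustration
base = 'http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src'
query = urllib.parse.urlencode({'user': 'admin', 'password': 'admin'})

# urllib.request.urlopen replaces urllib2.urlopen; read() now returns bytes,
# so the body has to be decoded explicitly
with urllib.request.urlopen('%s?%s' % (base, query)) as res:
    page = res.read().decode('utf-8')

# html.unescape replaces HTMLParser.HTMLParser().unescape from Python 2
print(html.unescape(page[:200]))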

 


3. File search and replace
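The original post gives no code for this step, so here is only a minimal sketch of the idea, assuming the goal is to walk a directory tree and replace a regex pattern in text files; the helper name, directory, pattern, and suffix list are all placeholders of mine, not from the original.

import os
import re

def search_replace(rootdir, pattern, repl, suffixes=('.java', '.c', '.h')):
    """Walk rootdir and replace every regex match in files with the given suffixes."""
    regex = re.compile(pattern)
    for dirpath, dirnames, filenames in os.walk(rootdir):
        for name in filenames:
            if not name.endswith(suffixes):
                continue
            path = os.path.join(dirpath, name)
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
            new_text, count = regex.subn(repl, text)
            if count:
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(new_text)
                print('replaced %d occurrence(s) in %s' % (count, path))

# hypothetical example: rename an identifier across the sources downloaded above
# search_replace('/tmp/jdk8u', r'\bUnsafe\b', 'TheUnsafe')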
4. Simple code checks
# quick substring searches
>>> '234234fdgdfs'.find('f')
6
>>> '234234fdgdfs'.index('f')
6
>>> '234234fdgdfs'[2:5]
'423'
# regex matching (import re first)
>>> import re
>>> re.findall(r'[a-zA-Z0-9]*\.[a-zA-Z1-9]*[\.|com]*', 'www.baidu.com')
['www.baidu.com']
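One caveat with the pattern above: the character class [\.|com] matches any single character out of ., |, c, o, m, not the literal string ".com". If the goal is to validate a whole domain-like string rather than pull out fragments, re.fullmatch (Python 3.4+) is a stricter check; the pattern here is only an illustration, not a complete domain validator:

>>> re.fullmatch(r'[a-zA-Z0-9]+(\.[a-zA-Z0-9]+)+', 'www.baidu.com') is not None
True
>>> re.fullmatch(r'[a-zA-Z0-9]+(\.[a-zA-Z0-9]+)+', 'not a domain') is not None
False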

 

5. Write an ops script that watches whether port 8080 on this machine is still up; if the service has died, email the owner and restart it.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import time
import sys
import smtplib
from email.mime.text import MIMEText
 
def send_email (warning):
    msg = MIMEText(warning)
    msg['Subject'] = 'python send warning mail'
    msg['From'] = 'Test<rootrr@163.com>'
    try:
       smtp = smtplib.SMTP()
       to_mail = 'xx@163.com'
       from_mail = 'xx@163.com'
       smtp.connect(r'smtp.qiye.163.com')
       smtp.login('xx@163.com', 'xxx123')
       smtp.sendmail(from_mail, to_mail, msg.as_string())
       smtp.close()
       print('send mail to %s, content is: %s' % (to_mail, msg))
    except Exception as e:
       print("Send mail Error: %s" % e)
# main monitoring loop
while True:
    http_status = os.popen('netstat -tulnp | grep ":8080"', 'r').readlines()
    try:
        if http_status == []:
            os.system('service tomcat7 start')
            time.sleep(3)    # give the service time to start
            new_http_status = os.popen('netstat -tulnp | grep ":8080"', 'r').readlines()
            str1 = ''.join(new_http_status)
            is_port = -1
            send_email(warning="8080 port shutdown, This is a warning!!!")  # notify the owner
            try:
                is_port = str1.split()[3].split(':')[-1]
            except IndexError as e:
                print("out of range:", e)
            if is_port != '8080':
                print('tomcat failed to restart')
            else:
                print('tomcat restarted successfully')
        else:
            print('port 8080 is up')
        time.sleep(5)
    except KeyboardInterrupt:
        sys.exit('interrupted, exiting\n')
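Parsing netstat output is brittle (column positions differ between distributions). An alternative not used in the original script is to let the socket module try to connect to the port directly; a sketch, assuming the service listens on localhost:

import socket

def port_is_open(host='127.0.0.1', port=8080, timeout=2):
    """Return True if a TCP connection to host:port succeeds."""
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

# the netstat/grep check in the loop above could then become:
# if not port_is_open():
#     send_email(warning="8080 port shutdown, This is a warning!!!")
#     os.system('service tomcat7 start')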

 

6. Scientific computing, big data, image recognition...

  Whatever your work calls for!

 

The following one-liner looks up the process listening on a given port and kills it. Note that kill does not read PIDs from stdin, so the PID has to be passed via xargs:

# netstat -tunlp | grep ':8080' | awk '{split($7, arr, "/"); print(arr[1])}' | xargs kill -9

Reposted from: https://www.cnblogs.com/yougewe/p/9454111.html

