使用selenium+python做爬虫时遇到的元素定位问题

显示全部楼层 · 2021-1-27 06:48:27

最近在做基于selenium的浏览器自动化程序，爬取地理空间数据云的数据，遇到一个很奇怪的问题
通过xpath选择元素的时候，第一次运行（调试），没有问题，目标对象file_name和cloud_planty都可以取到值，退出后调试再次运行的时候，会报错，如下图

重启后再次调试，第一次没有问题，第二次有和上面一样的问题
大致知道问题出在这两行代码，但是不知道怎么解决

百度一下后说是在xpath定位时路径中不能出现tbody，但修改后发现仍然无效
探索发现，问题可能与通过已定位的元素访问其他元素有关（即调用element.find_element_by_****），使用css定位也会出现同样的问题。我的开发环境为vs2019+python3.7+chrome79.0.3945.88
求专业人士解答
代码如下
fromseleniumimportwebdriver
fromselenium.webdriver.common.keysimportKeys
importre
importos
importshutil
importtime
importdatetime
# Base directory: used as Chrome's download directory and as the place where
# the record files and the per-path/row folders are created.
file_path='d:\\Landsat8\\'
# Suffix appended to a scene name to build the on-disk name of its download
# (a '.tar.gz' archive carrying the '.crdownload' extension).
file_name_tail='.tar.gz.crdownload'
# Check whether the download of a file has started
def is_download_started(file_name):
    """Return True if the in-progress download file for *file_name* exists.

    The path tested is ``file_path + file_name + file_name_tail``, built from
    the module-level settings, so *file_name* is the bare scene name without
    any extension.
    """
    final_file_name = file_path + file_name + file_name_tail
    return os.path.exists(final_file_name)
# Record a failed download
def down_error(file_name):
    """Append *file_name* and the current local time to ``error_record.txt``.

    The record file lives in the module-level ``file_path`` directory and is
    opened in append mode, so earlier entries are kept. Each entry is one
    line of the form ``<file_name> YYYY-MM-DD HH:MM:SS``.
    """
    # NOTE(review): the pasted source concatenated an empty string between the
    # name and the timestamp and had no gap between date and time; both look
    # like separators lost with the rest of the whitespace, so a single space
    # is used in each place -- confirm against the original script.
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # The context manager closes the file even if the write fails.
    with open(file_path + 'error_record.txt', 'a') as file_handle:
        file_handle.write(file_name + ' ' + timestamp + '\n')
# Record a successful download
def right_download(file_name):
    """Append *file_name* and the current local time to ``right_record.txt``.

    The record file lives in the module-level ``file_path`` directory and is
    opened in append mode, so earlier entries are kept. Each entry is one
    line of the form ``<file_name> YYYY-MM-DD HH:MM:SS``.
    """
    # NOTE(review): the pasted source concatenated an empty string between the
    # name and the timestamp and had no gap between date and time; both look
    # like separators lost with the rest of the whitespace, so a single space
    # is used in each place -- confirm against the original script.
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # The context manager closes the file even if the write fails.
    with open(file_path + 'right_record.txt', 'a') as file_handle:
        file_handle.write(file_name + ' ' + timestamp + '\n')
# Move a downloaded file into its path/row folder
def move_file(path, row, file_name):
    """Move the downloaded archive for *file_name* into ``<path>_<row>``.

    The source is ``file_path + file_name + file_name_tail`` with its trailing
    '.crdownload' (11 characters) removed; with the module's
    ``file_name_tail`` of '.tar.gz.crdownload' that is the '.tar.gz' archive.
    The target folder ``file_path + '<path>_<row>'`` is created when missing.

    Args:
        path: Path number; converted with ``str`` for the folder name.
        row: Row number; converted with ``str`` for the folder name.
        file_name: Bare scene name, without any extension.
    """
    partial_suffix = '.crdownload'
    final_file_path = file_path + file_name + file_name_tail
    # The suffix length is cut off unconditionally, exactly like the former
    # hard-coded 11; no check is made that the name really ends with it.
    final_file_path = final_file_path[:-len(partial_suffix)]
    target_dir = file_path + str(path) + '_' + str(row)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(target_dir, exist_ok=True)
    # NOTE(review): indentation was lost in the pasted source; the move is
    # assumed to run on every call, not only when the folder was just created.
    shutil.move(final_file_path, target_dir)
# Start the browser: Chrome is configured to download into file_path, with the
# popup content setting set to 0.
options=webdriver.ChromeOptions()
prefs={'profile.default_content_settings.popups':0,'download.default_directory':file_path}
options.add_experimental_option('prefs',prefs)
wd=webdriver.Chrome(executable_path='D:\\software\\chromedriver.exe',chrome_options=options)
wd.implicitly_wait(10)  # wait up to ten seconds when an element cannot be found
# Open the web page
wd.get('http://www.gscloud.cn/sources/?cdataid=263&pdataid=10')
# Log in (via the QQ login option)
# NOTE(review): the class value below contains no spaces; it presumably lost
# them together with the rest of the whitespace in this paste -- verify
# against the page markup.
wd.find_element_by_css_selector('[class="navnavbar-navnavbar-right"]>li').click()
wd.find_element_by_css_selector('[src="/static/img/accounts/qq_login_logo.png"]').click()
# The login form is looked up inside the 'ptlogin_iframe' frame.
wd.switch_to.frame('ptlogin_iframe')
wd.find_element_by_css_selector('#switcher_plogin').click()
wd.find_element_by_css_selector('#u').send_keys('qq账号')
wd.find_element_by_css_selector('#p').send_keys('qq密码')
wd.find_element_by_css_selector('#login_button').click()
# Select the Landsat 8 data
# NOTE(review): nothing switches back out of the login frame before these
# look-ups (no switch_to.default_content() call) -- confirm that the driver
# context is the main page again at this point.
wd.find_element_by_css_selector('[src="/static/img/index/1_03.png"]').click()
wd.find_element_by_css_selector('[href="/sources/list_dataset/411?cdataid=263&pdataid=10&datatype=OLI_TIRS"]').click()
paths=[122,123,124,125,126]
rows=[37,38,39,40]  # path and row numbers iterated over below
forpathinpaths:
forrowinrows:
page_num=1
#选取行列号
p=wd.find_element_by_css_selector('[title="二次筛选"][filtername="path"]')
p.clear()
p.send_keys(path)
r=wd.find_element_by_css_selector('[title="二次筛选"][filtername="row"]')
r.clear()
r.send_keys(row)
time.sleep(1)
wd.find_element_by_css_selector('[title="二次筛选"][filtername="row"]').send_keys(Keys.ENTER)
sum_pages=re.findall(r'\d+',wd.find_element_by_css_selector('[style="padding-right:6px;"]').text)
#e=wd.find_element_by_xpath('//tbody/tr[3]//a[@title="下载"]/../../../preceding-sibling::*//div[@style="word-break:break-all"]')
#print(e.text)
while(int(sum_pages[0])>=page_num):
#下载
elements=wd.find_elements_by_css_selector('tbody[title="下载"]')#css选择
#elements=wd.find_elements_by_css_selector('a[title="下载"]')#css选择
#elements=wd.find_elements_by_xpath('//a[@title="下载"]')#xpath选择/preceding-sibling::*//div[@style="word-break:break-all"]
forelementinelements:
file_name=element.find_element_by_xpath('./../../../preceding-sibling::*//div[@style="word-break:break-all"]').text
cloud_planty=element.find_element_by_xpath('./../../../preceding-sibling::*[5]/div').text
ifcloud_planty!=''andfloat(cloud_planty)
分 -->